[libclc] r219217 - Add AMD OpenCL builtins

Tom Stellard thomas.stellard at amd.com
Tue Oct 7 10:10:49 PDT 2014


Added: libclc/branches/amd-builtins/amd-builtins/media/media.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/media.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/media.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/media.h Tue Oct  7 12:10:46 2014
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_amd_media_ops : enable
+
+extern __attribute__((const)) uint __hsail_bitalign_b32(uint, uint, uint);
+
+extern __attribute__((const)) uint __hsail_bytealign_b32(uint, uint, uint);
+
+extern __attribute__((pure)) uint  __hsail_packcvt_u8x4_f32(float,float,float,float);
+
+extern __attribute__((pure)) uint __hsail_lerp_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sad_u32_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) uint __hsail_sadhi_u16x2_u8x4(uint,uint,uint);
+
+extern __attribute__((pure)) float __hsail_unpackcvt_f32_u8x4(uint,uint);
+
+extern __attribute__((const)) uint __hsail_msad(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadd(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_sadw(uint,uint,uint);
+
+extern __attribute__((const)) uint __hsail_umin3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imin3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umax3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imax3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_umedian3(uint,uint,uint);
+
+extern __attribute__((const)) int __hsail_imedian3(int,int,int);
+
+extern __attribute__((const)) uint __hsail_bfe(uint,uint,uint);
+
+extern __attribute__((const)) float __hsail_f32_min3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_max3(float,float,float);
+
+extern __attribute__((const)) float __hsail_f32_median3(float,float,float);
+
+extern __attribute__((const)) ulong __hsail_mqsad(ulong,uint,ulong);
+
+extern __attribute__((const)) ulong __hsail_qsad(ulong,uint,ulong);
+
+extern __attribute__((const)) uint __hsail_bfm(uint,uint);
+
+extern __attribute__((const)) int __hsail_ibfe(int,uint,uint);

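Note: these declarations are the HSAIL compiler intrinsics that the amd_* wrappers in the rest of this patch lower to; application code only ever calls the amd_* overloads. A minimal usage sketch (illustrative only, not part of this patch) of a kernel built against these builtins:

#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

/* Illustrative 3-tap median filter using the scalar amd_median3 overload
 * added later in this patch (median3.cl). */
__kernel void median_filter_1d(__global const uint *in,
                               __global uint *out,
                               uint n)
{
    size_t i = get_global_id(0);
    if (i > 0 && i + 1 < n)
        out[i] = amd_median3(in[i - 1], in[i], in[i + 1]);
}
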
Added: libclc/branches/amd-builtins/amd-builtins/media/median3.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/median3.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/median3.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/median3.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_median3(uint2 v1, uint2 v2, uint2 v3) 
+{
+    uint2 ret;
+    ret.x =  __hsail_umedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_median3(uint3 v1, uint3 v2, uint3 v3) 
+{
+    uint3 ret;
+    ret.x =  __hsail_umedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_umedian3(v1.z,v2.z, v3.z);
+    return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) uint4 amd_median3(uint4 v1, uint4 v2, uint4 v3) 
+{
+    uint4 ret;
+    ret.x =  __hsail_umedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umedian3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_umedian3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_umedian3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_median3(uint8 v1, uint8 v2, uint8 v3) 
+{
+    uint8 ret;
+    ret.s0 =  __hsail_umedian3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_umedian3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_umedian3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_umedian3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_umedian3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_umedian3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_umedian3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_umedian3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_median3(uint16 v1, uint16 v2, uint16 v3) 
+{
+    uint16 ret;
+    ret.s0 =  __hsail_umedian3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_umedian3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_umedian3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_umedian3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_umedian3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_umedian3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_umedian3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_umedian3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_umedian3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_umedian3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_umedian3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_umedian3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_umedian3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_umedian3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_umedian3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_umedian3(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_median3(uint v1, uint v2, uint v3) 
+{
+    return  __hsail_umedian3(v1,v2,v3) ;
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_median3(float2 v1, float2 v2, float2 v3) 
+{
+    float2 ret;
+    ret.x =  __hsail_f32_median3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_median3(float3 v1, float3 v2, float3 v3) 
+{
+    float3 ret;
+    ret.x =  __hsail_f32_median3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_f32_median3(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float4 amd_median3(float4 v1, float4 v2, float4 v3) 
+{
+    float4 ret;
+    ret.x =  __hsail_f32_median3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_median3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_f32_median3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_f32_median3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_median3(float8 v1, float8 v2, float8 v3) 
+{
+    float8 ret;
+    ret.s0 =  __hsail_f32_median3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_median3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_f32_median3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_median3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_f32_median3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_f32_median3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_f32_median3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_f32_median3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_median3(float16 v1, float16 v2, float16 v3) 
+{
+    float16 ret;
+    ret.s0 =  __hsail_f32_median3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_median3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_f32_median3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_median3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_f32_median3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_f32_median3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_f32_median3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_f32_median3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_f32_median3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_f32_median3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_f32_median3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_f32_median3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_f32_median3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_f32_median3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_f32_median3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_f32_median3(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_median3(float v1, float v2, float v3) 
+{
+    return  __hsail_f32_median3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_median3(int2 v1, int2 v2, int2 v3)
+{
+    int2 ret;
+    ret.x =  __hsail_imedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_median3(int3 v1, int3 v2, int3 v3)
+{
+    int3 ret;
+    ret.x =  __hsail_imedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_imedian3(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_median3(int4 v1, int4 v2, int4 v3)
+{
+    int4 ret;
+    ret.x =  __hsail_imedian3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imedian3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_imedian3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_imedian3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_median3(int8 v1, int8 v2, int8 v3)
+{
+    int8 ret;
+    ret.s0 =  __hsail_imedian3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_imedian3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_imedian3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_imedian3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_imedian3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_imedian3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_imedian3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_imedian3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_median3(int16 v1, int16 v2, int16 v3)
+{
+    int16 ret;
+    ret.s0 =  __hsail_imedian3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_imedian3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_imedian3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_imedian3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_imedian3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_imedian3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_imedian3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_imedian3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_imedian3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_imedian3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_imedian3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_imedian3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_imedian3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_imedian3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_imedian3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_imedian3(v1.sf,v2.sf,v3.sf);
+    
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_median3(int v1, int v2, int v3)
+{
+    return  __hsail_imedian3(v1,v2,v3);
+}

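Each overload above simply maps __hsail_umedian3 / __hsail_imedian3 / __hsail_f32_median3 across the vector components. For reference, the expected scalar behaviour is a plain median-of-three; a sketch in terms of the standard min()/max() builtins (an assumption about the intrinsic's semantics, ignoring NaN handling for the float variant):

/* Reference-only sketch; the builtin itself is opaque in this patch. */
uint median3_ref(uint a, uint b, uint c)
{
    return max(min(a, b), min(max(a, b), c));
}
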
Added: libclc/branches/amd-builtins/amd-builtins/media/min3.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/min3.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/min3.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/min3.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,215 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_min3(uint2 v1, uint2 v2, uint2 v3)
+{
+    uint2 ret;
+    ret.x =  __hsail_umin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_min3(uint3 v1, uint3 v2, uint3 v3)
+{
+    uint3 ret;
+    ret.x =  __hsail_umin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_umin3(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_min3(uint4 v1, uint4 v2, uint4 v3)
+{
+    uint4 ret;
+    ret.x =  __hsail_umin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_umin3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_umin3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_umin3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_min3(uint8 v1, uint8 v2, uint8 v3)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_umin3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_umin3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_umin3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_umin3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_umin3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_umin3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_umin3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_umin3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_min3(uint16 v1, uint16 v2, uint16 v3)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_umin3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_umin3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_umin3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_umin3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_umin3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_umin3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_umin3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_umin3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_umin3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_umin3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_umin3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_umin3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_umin3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_umin3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_umin3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_umin3(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_min3(uint v1, uint v2, uint v3)
+{
+    return  __hsail_umin3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) float2 amd_min3(float2 v1, float2 v2, float2 v3)
+{
+    float2 ret;
+    ret.x =  __hsail_f32_min3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float3 amd_min3(float3 v1, float3 v2, float3 v3)
+{
+    float3 ret;
+    ret.x =  __hsail_f32_min3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_f32_min3(v1.z,v2.z, v3.z);
+    return ret;
+}
+
+__attribute__((overloadable,always_inline,const)) float4 amd_min3(float4 v1, float4 v2, float4 v3)
+{
+    float4 ret;
+    ret.x =  __hsail_f32_min3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_f32_min3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_f32_min3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_f32_min3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float8 amd_min3(float8 v1, float8 v2, float8 v3)
+{
+    float8 ret;
+    ret.s0 =  __hsail_f32_min3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_min3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_f32_min3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_min3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_f32_min3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_f32_min3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_f32_min3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_f32_min3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float16 amd_min3(float16 v1, float16 v2, float16 v3)
+{
+    float16 ret;
+    ret.s0 =  __hsail_f32_min3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_f32_min3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_f32_min3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_f32_min3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_f32_min3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_f32_min3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_f32_min3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_f32_min3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_f32_min3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_f32_min3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_f32_min3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_f32_min3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_f32_min3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_f32_min3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_f32_min3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_f32_min3(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) float amd_min3(float v1, float v2, float v3)
+{
+    return  __hsail_f32_min3(v1,v2,v3);
+}
+__attribute__((overloadable,always_inline,const)) int2 amd_min3(int2 v1, int2 v2, int2 v3)
+{
+    int2 ret;
+    ret.x =  __hsail_imin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int3 amd_min3(int3 v1, int3 v2, int3 v3)
+{
+    int3 ret;
+    ret.x =  __hsail_imin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_imin3(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int4 amd_min3(int4 v1, int4 v2, int4 v3)
+{
+    int4 ret;
+    ret.x =  __hsail_imin3(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_imin3(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_imin3(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_imin3(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int8 amd_min3(int8 v1, int8 v2, int8 v3)
+{
+    int8 ret;
+    ret.s0 =  __hsail_imin3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_imin3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_imin3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_imin3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_imin3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_imin3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_imin3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_imin3(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int16 amd_min3(int16 v1, int16 v2, int16 v3)
+{
+    int16 ret;
+    ret.s0 =  __hsail_imin3(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_imin3(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_imin3(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_imin3(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_imin3(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_imin3(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_imin3(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_imin3(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_imin3(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_imin3(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_imin3(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_imin3(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_imin3(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_imin3(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_imin3(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_imin3(v1.sf,v2.sf,v3.sf);
+    
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) int amd_min3(int v1, int v2, int v3)
+{
+    return  __hsail_imin3(v1,v2,v3);
+}

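amd_min3 follows the same expansion pattern. The expected behaviour (again an assumption, since only the __hsail_* names are visible here) is an ordinary three-way minimum, which for the vector overloads could equally be written with the component-wise min() builtin:

/* Reference-only sketch for the uint4 overload. */
uint4 min3_ref(uint4 a, uint4 b, uint4 c)
{
    return min(min(a, b), c);
}
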
Added: libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/mqsad.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) ulong2 amd_mqsad(ulong2 v1, uint2 v2, ulong2 v3) 
+{
+    ulong2 ret;
+    ret.x =  __hsail_mqsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_mqsad(ulong3 v1, uint3 v2, ulong3 v3) 
+{
+    ulong3 ret;
+    ret.x =  __hsail_mqsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_mqsad(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_mqsad(ulong4 v1, uint4 v2, ulong4 v3) 
+{
+    ulong4 ret;
+    ret.x =  __hsail_mqsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_mqsad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_mqsad(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_mqsad(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_mqsad(ulong8 v1, uint8 v2, ulong8 v3) 
+{
+    ulong8 ret;
+    ret.s0 =  __hsail_mqsad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_mqsad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_mqsad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_mqsad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_mqsad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_mqsad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_mqsad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_mqsad(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_mqsad(ulong16 v1, uint16 v2, ulong16 v3) 
+{
+    ulong16 ret;
+    ret.s0 =  __hsail_mqsad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_mqsad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_mqsad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_mqsad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_mqsad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_mqsad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_mqsad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_mqsad(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_mqsad(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_mqsad(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_mqsad(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_mqsad(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_mqsad(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_mqsad(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_mqsad(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_mqsad(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_mqsad(ulong v1, uint v2, ulong v3) 
+{
+    return  __hsail_mqsad(v1,v2,v3);
+}
+

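amd_mqsad (and amd_qsad below) take the (ulong, uint, ulong) operand shapes declared in media.h. A hypothetical call site for the scalar overload, with all buffer names illustrative:

#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

__kernel void mqsad_rows(__global const ulong *src8,   /* 8 packed bytes */
                         __global const uint  *ref4,   /* 4 packed bytes */
                         __global ulong *accum)
{
    size_t i = get_global_id(0);
    accum[i] = amd_mqsad(src8[i], ref4[i], accum[i]);
}
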
Added: libclc/branches/amd-builtins/amd-builtins/media/msad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/msad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/msad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/msad.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_msad(uint2 v1, uint2 v2, uint2 v3) 
+{
+    uint2 ret;
+    ret.x =  __hsail_msad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_msad(uint3 v1, uint3 v2, uint3 v3) 
+{
+    uint3 ret;
+    ret.x =  __hsail_msad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_msad(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_msad(uint4 v1, uint4 v2, uint4 v3) 
+{
+    uint4 ret;
+    ret.x =  __hsail_msad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_msad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_msad(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_msad(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_msad(uint8 v1, uint8 v2, uint8 v3) 
+{
+    uint8 ret;
+    ret.s0 =  __hsail_msad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_msad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_msad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_msad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_msad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_msad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_msad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_msad(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_msad(uint16 v1, uint16 v2, uint16 v3) 
+{
+    uint16 ret;
+    ret.s0 =  __hsail_msad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_msad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_msad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_msad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_msad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_msad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_msad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_msad(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_msad(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_msad(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_msad(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_msad(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_msad(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_msad(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_msad(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_msad(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_msad(uint v1, uint v2, uint v3) 
+{
+    return  __hsail_msad(v1,v2,v3);
+}

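The same wrapper pattern is used for amd_msad (masked SAD, per the extension's naming). A hypothetical call site using the uint4 overload (names illustrative):

#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

__kernel void msad_blocks(__global const uint4 *a,
                          __global const uint4 *b,
                          __global uint4 *acc)
{
    size_t i = get_global_id(0);
    acc[i] = amd_msad(a[i], b[i], acc[i]);
}
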
Added: libclc/branches/amd-builtins/amd-builtins/media/pack.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/pack.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/pack.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/pack.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,33 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+ uint amd_pack(float4 v)
+{
+    return __hsail_packcvt_u8x4_f32(v.s0,v.s1,v.s2,v.s3);
+}

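amd_pack converts the four float components to bytes and packs them into one uint via __hsail_packcvt_u8x4_f32; the __clang__ guard only controls whether the wrapper is marked overloadable. A rough reference sketch, with the exact conversion/saturation rules left as an assumption since they are defined by the HSAIL packcvt operation rather than by this file:

/* Reference-only sketch; conversion behaviour is assumed, not taken from
 * this patch. */
uint pack_ref(float4 v)
{
    return ((uint)v.s0 & 0xff)
         | (((uint)v.s1 & 0xff) << 8)
         | (((uint)v.s2 & 0xff) << 16)
         | (((uint)v.s3 & 0xff) << 24);
}
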
Added: libclc/branches/amd-builtins/amd-builtins/media/qsad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/qsad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/qsad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/qsad.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) ulong2 amd_qsad(ulong2 v1, uint2 v2, ulong2 v3) 
+{
+    ulong2 ret;
+    ret.x =  __hsail_qsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong3 amd_qsad(ulong3 v1, uint3 v2, ulong3 v3) 
+{
+    ulong3 ret;
+    ret.x =  __hsail_qsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_qsad(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong4 amd_qsad(ulong4 v1, uint4 v2, ulong4 v3) 
+{
+    ulong4 ret;
+    ret.x =  __hsail_qsad(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_qsad(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_qsad(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_qsad(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong8 amd_qsad(ulong8 v1, uint8 v2, ulong8 v3) 
+{
+    ulong8 ret;
+    ret.s0 =  __hsail_qsad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_qsad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_qsad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_qsad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_qsad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_qsad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_qsad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_qsad(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong16 amd_qsad(ulong16 v1, uint16 v2, ulong16 v3) 
+{
+    ulong16 ret;
+    ret.s0 =  __hsail_qsad(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_qsad(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_qsad(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_qsad(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_qsad(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_qsad(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_qsad(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_qsad(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_qsad(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_qsad(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_qsad(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_qsad(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_qsad(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_qsad(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_qsad(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_qsad(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) ulong amd_qsad(ulong v1, uint v2, ulong v3) 
+{
+    return  __hsail_qsad(v1,v2,v3);
+}
+

Added: libclc/branches/amd-builtins/amd-builtins/media/sad.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sad.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sad.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sad.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) uint
+amd_sad(uint a, uint b, uint c)
+{
+    return __hsail_sad_u32_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sad(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sad(uint3 a, uint3 b, uint3 c)
+{
+
+    uint3 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+    return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sad(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_sad_u32_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sad_u32_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sad_u32_u8x4(a.z, b.z, c.z);
+    ret.w =  __hsail_sad_u32_u8x4(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sad(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sad(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sad_u32_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sad_u32_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sad_u32_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sad_u32_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sad_u32_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sad_u32_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sad_u32_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sad_u32_u8x4(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_sad_u32_u8x4(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_sad_u32_u8x4(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_sad_u32_u8x4(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_sad_u32_u8x4(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_sad_u32_u8x4(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_sad_u32_u8x4(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_sad_u32_u8x4(a.se, b.se, c.se);
+    ret.sf =  __hsail_sad_u32_u8x4(a.sf, b.sf, c.sf);
+    return ret;
+}
+

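For the scalar amd_sad the expected behaviour is the usual byte-wise sum of absolute differences accumulated into the third operand. A reference sketch of that assumption, written with the standard as_uchar4/abs_diff builtins:

/* Reference-only sketch of the assumed scalar behaviour. */
uint sad_ref(uint x, uint y, uint z)
{
    uchar4 a = as_uchar4(x);
    uchar4 b = as_uchar4(y);
    return z + abs_diff(a.s0, b.s0) + abs_diff(a.s1, b.s1)
             + abs_diff(a.s2, b.s2) + abs_diff(a.s3, b.s3);
}
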
Added: libclc/branches/amd-builtins/amd-builtins/media/sad4.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sad4.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sad4.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sad4.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,37 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+#ifdef __clang__
+__attribute__((overloadable, always_inline))
+#else
+__attribute__((always_inline))
+#endif
+uint amd_sad4(uint4 x, uint4 y, uint z)
+{
+    uint a = __hsail_sad_u32_u8x4(x.s0,y.s0,z);
+    a =  __hsail_sad_u32_u8x4(x.s1,y.s1,a);
+    a =  __hsail_sad_u32_u8x4(x.s2,y.s2,a);
+
+    return  __hsail_sad_u32_u8x4(x.s3,y.s3,a);
+}
+

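The body above makes the relationship to amd_sad explicit: amd_sad4 is the scalar SAD chained across the four components, so it is equivalent to:

/* Equivalent formulation in terms of the amd_sad wrapper from sad.cl;
 * this follows directly from the chained __hsail_sad_u32_u8x4 calls above. */
uint sad4_ref(uint4 x, uint4 y, uint z)
{
    return amd_sad(x.s3, y.s3,
           amd_sad(x.s2, y.s2,
           amd_sad(x.s1, y.s1,
           amd_sad(x.s0, y.s0, z))));
}
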
Added: libclc/branches/amd-builtins/amd-builtins/media/sadd.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadd.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadd.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadd.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadd(uint2 v1, uint2 v2, uint2 v3) 
+{
+    uint2 ret;
+    ret.x =  __hsail_sadd(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadd(uint3 v1, uint3 v2, uint3 v3) 
+{
+    uint3 ret;
+    ret.x =  __hsail_sadd(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_sadd(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadd(uint4 v1, uint4 v2, uint4 v3) 
+{
+    uint4 ret;
+    ret.x =  __hsail_sadd(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadd(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_sadd(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_sadd(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadd(uint8 v1, uint8 v2, uint8 v3) 
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadd(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadd(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_sadd(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadd(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_sadd(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_sadd(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_sadd(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_sadd(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadd(uint16 v1, uint16 v2, uint16 v3) 
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadd(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadd(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_sadd(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadd(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_sadd(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_sadd(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_sadd(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_sadd(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_sadd(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_sadd(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_sadd(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_sadd(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_sadd(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_sadd(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_sadd(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_sadd(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadd(uint v1, uint v2, uint v3) 
+{
+    return  __hsail_sadd(v1,v2,v3);
+}

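amd_sadd repeats the same per-component expansion as the other cl_amd_media_ops2 wrappers in this patch. As an illustration of that pattern (a sketch only, not something this patch adds), the two-component overloads could all be produced by one macro:

/* Illustrative only: how an equivalent of the uint2 amd_sadd overload above
 * could be generated generically. */
#define WRAP3_U32_2(NAME, HSAIL)                                   \
__attribute__((overloadable, always_inline, const))                \
uint2 NAME(uint2 a, uint2 b, uint2 c)                              \
{                                                                  \
    return (uint2)(HSAIL(a.x, b.x, c.x), HSAIL(a.y, b.y, c.y));    \
}

/* WRAP3_U32_2(amd_sadd, __hsail_sadd) expands to an equivalent of the
 * uint2 overload above. */
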
Added: libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadhi.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) uint
+amd_sadhi(uint a, uint b, uint c)
+{
+    return __hsail_sadhi_u16x2_u8x4(a, b, c);
+}
+
+__attribute__((overloadable, always_inline)) uint2
+amd_sadhi(uint2 a, uint2 b, uint2 c)
+{
+    uint2 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint3
+amd_sadhi(uint3 a, uint3 b, uint3 c)
+{
+
+    uint3 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+    return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) uint4
+amd_sadhi(uint4 a, uint4 b, uint4 c)
+{
+    uint4 ret;
+    ret.x =  __hsail_sadhi_u16x2_u8x4(a.x, b.x, c.x);
+    ret.y =  __hsail_sadhi_u16x2_u8x4(a.y, b.y, c.y);
+    ret.z =  __hsail_sadhi_u16x2_u8x4(a.z, b.z, c.z);
+    ret.w =  __hsail_sadhi_u16x2_u8x4(a.w, b.w, c.w);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint8
+amd_sadhi(uint8 a, uint8 b, uint8 c)
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) uint16
+amd_sadhi(uint16 a, uint16 b, uint16 c)
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadhi_u16x2_u8x4(a.s0, b.s0, c.s0);
+    ret.s1 =  __hsail_sadhi_u16x2_u8x4(a.s1, b.s1, c.s1);
+    ret.s2 =  __hsail_sadhi_u16x2_u8x4(a.s2, b.s2, c.s2);
+    ret.s3 =  __hsail_sadhi_u16x2_u8x4(a.s3, b.s3, c.s3);
+    ret.s4 =  __hsail_sadhi_u16x2_u8x4(a.s4, b.s4, c.s4);
+    ret.s5 =  __hsail_sadhi_u16x2_u8x4(a.s5, b.s5, c.s5);
+    ret.s6 =  __hsail_sadhi_u16x2_u8x4(a.s6, b.s6, c.s6);
+    ret.s7 =  __hsail_sadhi_u16x2_u8x4(a.s7, b.s7, c.s7);
+    ret.s8 =  __hsail_sadhi_u16x2_u8x4(a.s8, b.s8, c.s8);
+    ret.s9 =  __hsail_sadhi_u16x2_u8x4(a.s9, b.s9, c.s9);
+    ret.sa =  __hsail_sadhi_u16x2_u8x4(a.sa, b.sa, c.sa);
+    ret.sb =  __hsail_sadhi_u16x2_u8x4(a.sb, b.sb, c.sb);
+    ret.sc =  __hsail_sadhi_u16x2_u8x4(a.sc, b.sc, c.sc);
+    ret.sd =  __hsail_sadhi_u16x2_u8x4(a.sd, b.sd, c.sd);
+    ret.se =  __hsail_sadhi_u16x2_u8x4(a.se, b.se, c.se);
+    ret.sf =  __hsail_sadhi_u16x2_u8x4(a.sf, b.sf, c.sf);
+    return ret;
+}
+

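amd_sadhi mirrors amd_sad but routes through __hsail_sadhi_u16x2_u8x4; the usual description of this operation is that the byte-wise SAD is accumulated into the high 16 bits of the third operand. Treat that as an assumption here; as a sketch:

/* Reference-only sketch of the assumed relationship to amd_sad. */
uint sadhi_ref(uint a, uint b, uint c)
{
    return c + (amd_sad(a, b, 0u) << 16);
}
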
Added: libclc/branches/amd-builtins/amd-builtins/media/sadw.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/sadw.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/sadw.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/sadw.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include "media.h"
+
+#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable
+
+__attribute__((overloadable,always_inline,const)) uint2 amd_sadw(uint2 v1, uint2 v2, uint2 v3) 
+{
+    uint2 ret;
+    ret.x =  __hsail_sadw(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y,v2.y,v3.y);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint3 amd_sadw(uint3 v1, uint3 v2, uint3 v3) 
+{
+    uint3 ret;
+    ret.x =  __hsail_sadw(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_sadw(v1.z,v2.z, v3.z);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint4 amd_sadw(uint4 v1, uint4 v2, uint4 v3) 
+{
+    uint4 ret;
+    ret.x =  __hsail_sadw(v1.x,v2.x, v3.x);
+    ret.y =  __hsail_sadw(v1.y,v2.y,v3.y);
+    ret.z =  __hsail_sadw(v1.z,v2.z, v3.z);
+    ret.w =  __hsail_sadw(v1.w,v2.w,v3.w);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint8 amd_sadw(uint8 v1, uint8 v2, uint8 v3) 
+{
+    uint8 ret;
+    ret.s0 =  __hsail_sadw(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadw(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_sadw(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadw(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_sadw(v1.s4,v2.s4,v3.s4 );
+    ret.s5 =  __hsail_sadw(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_sadw(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_sadw(v1.s7,v2.s7,v3.s7);
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint16 amd_sadw(uint16 v1, uint16 v2, uint16 v3) 
+{
+    uint16 ret;
+    ret.s0 =  __hsail_sadw(v1.s0,v2.s0, v3.s0);
+    ret.s1 =  __hsail_sadw(v1.s1,v2.s1,v3.s1);
+    ret.s2 =  __hsail_sadw(v1.s2,v2.s2, v3.s2);
+    ret.s3 =  __hsail_sadw(v1.s3,v2.s3,v3.s3);
+    ret.s4 =  __hsail_sadw(v1.s4,v2.s4,v3.s4) ;
+    ret.s5 =  __hsail_sadw(v1.s5,v2.s5,v3.s5);
+    ret.s6 =  __hsail_sadw(v1.s6,v2.s6,v3.s6 );
+    ret.s7 =  __hsail_sadw(v1.s7,v2.s7,v3.s7);
+    ret.s8 =  __hsail_sadw(v1.s8,v2.s8,v3.s8 );
+    ret.s9 =  __hsail_sadw(v1.s9,v2.s9,v3.s9);
+    ret.sa =  __hsail_sadw(v1.sa,v2.sa, v3.sa);
+    ret.sb =  __hsail_sadw(v1.sb,v2.sb,v3.sb);
+    ret.sc =  __hsail_sadw(v1.sc,v2.sc, v3.sc);
+    ret.sd =  __hsail_sadw(v1.sd,v2.sd,v3.sd);
+    ret.se =  __hsail_sadw(v1.se,v2.se, v3.se);
+    ret.sf=  __hsail_sadw(v1.sf,v2.sf,v3.sf);
+
+    return ret;
+}
+__attribute__((overloadable,always_inline,const)) uint amd_sadw(uint v1, uint v2, uint v3) 
+{
+    return  __hsail_sadw(v1,v2,v3);
+}

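amd_sadw completes the SAD family in this patch. Assuming, as with amd_sad, that the third operand carries the running total, a hypothetical row-reduction use of the scalar overload looks like (all names illustrative):

#pragma OPENCL EXTENSION cl_amd_media_ops2 : enable

__kernel void sadw_rows(__global const uint *a,
                        __global const uint *b,
                        __global uint *out,
                        uint width)
{
    size_t row = get_global_id(0);
    uint acc = 0;
    for (uint i = 0; i < width; ++i)
        acc = amd_sadw(a[row * width + i], b[row * width + i], acc);
    out[row] = acc;
}
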
Added: libclc/branches/amd-builtins/amd-builtins/media/unpack.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/media/unpack.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/media/unpack.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/media/unpack.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,327 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "media.h"
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack0(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,0);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack0(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack0(uint3 a)
+{
+
+    float3  ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,0);
+    return ret;
+
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack0(uint4 a)
+{
+    float4 ret;
+    ret.x = __hsail_unpackcvt_f32_u8x4(a.x,0);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,0);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,0);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack0(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,0);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,0);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,0);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,0);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,0);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,0);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,0);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack0(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,0);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,0);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,0);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,0);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,0);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,0);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,0);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,0);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,0);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,0);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,0);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,0);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,0);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,0);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,0);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,0);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack1(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,1);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack1(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack1(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack1(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,1);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,1);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,1);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack1(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,1);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,1);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,1);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,1);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,1);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,1);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,1);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack1(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,1);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,1);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,1);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,1);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,1);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,1);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,1);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,1);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,1);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,1);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,1);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,1);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,1);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,1);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,1);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,1);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack2(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,2);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack2(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack2(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack2(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,2);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,2);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,2);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack2(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,2);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,2);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,2);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,2);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,2);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,2);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,2);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack2(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,2);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,2);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,2);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,2);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,2);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,2);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,2);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,2);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,2);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,2);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,2);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,2);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,2);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,2);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,2);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,2);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float
+amd_unpack3(uint a)
+{
+    return __hsail_unpackcvt_f32_u8x4(a,3);
+}
+
+__attribute__((overloadable, always_inline)) float2
+amd_unpack3(uint2 a)
+{
+    float2 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float3
+amd_unpack3(uint3 a)
+{
+    float3 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float4
+amd_unpack3(uint4 a)
+{
+    float4 ret;
+    ret.x =  __hsail_unpackcvt_f32_u8x4(a.x,3);
+    ret.y =  __hsail_unpackcvt_f32_u8x4(a.y,3);
+    ret.z =  __hsail_unpackcvt_f32_u8x4(a.z,3);
+    ret.w =  __hsail_unpackcvt_f32_u8x4(a.w,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float8
+amd_unpack3(uint8 a)
+{
+    float8 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,3);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,3);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,3);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,3);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,3);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,3);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,3);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,3);
+    return ret;
+}
+
+__attribute__((overloadable, always_inline)) float16
+amd_unpack3(uint16 a)
+{
+    float16 ret;
+    ret.s0 =  __hsail_unpackcvt_f32_u8x4(a.s0,3);
+    ret.s1 =  __hsail_unpackcvt_f32_u8x4(a.s1,3);
+    ret.s2 =  __hsail_unpackcvt_f32_u8x4(a.s2,3);
+    ret.s3 =  __hsail_unpackcvt_f32_u8x4(a.s3,3);
+    ret.s4 =  __hsail_unpackcvt_f32_u8x4(a.s4,3);
+    ret.s5 =  __hsail_unpackcvt_f32_u8x4(a.s5,3);
+    ret.s6 =  __hsail_unpackcvt_f32_u8x4(a.s6,3);
+    ret.s7 =  __hsail_unpackcvt_f32_u8x4(a.s7,3);
+    ret.s8 =  __hsail_unpackcvt_f32_u8x4(a.s8,3);
+    ret.s9 =  __hsail_unpackcvt_f32_u8x4(a.s9,3);
+    ret.sa =  __hsail_unpackcvt_f32_u8x4(a.sa,3);
+    ret.sb =  __hsail_unpackcvt_f32_u8x4(a.sb,3);
+    ret.sc =  __hsail_unpackcvt_f32_u8x4(a.sc,3);
+    ret.sd =  __hsail_unpackcvt_f32_u8x4(a.sd,3);
+    ret.se =  __hsail_unpackcvt_f32_u8x4(a.se,3);
+    ret.sf =  __hsail_unpackcvt_f32_u8x4(a.sf,3);
+    return ret;
+}

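Each amd_unpackN overload above selects byte N of the packed 32-bit word and converts it to float. A usage sketch, assuming cl_amd_media_ops is enabled; the pixel layout below is an assumption for illustration, not something the builtins mandate:

    // Illustrative only: expand a packed 8-bit-per-channel pixel into floats
    // in the 0..255 range using the scalar overloads defined above.
    __kernel void unpack_example(__global const uint *packed,
                                 __global float4 *out)
    {
        size_t i = get_global_id(0);
        uint p = packed[i];
        out[i] = (float4)(amd_unpack0(p),   // byte 0 (bits 7:0)
                          amd_unpack1(p),   // byte 1 (bits 15:8)
                          amd_unpack2(p),   // byte 2 (bits 23:16)
                          amd_unpack3(p));  // byte 3 (bits 31:24)
    }
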
Added: libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/amdil-to-hsail.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,352 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+// __amdil_ to __hsail_ translation
+
+// HSAIL intrinsic functions used by math32 functions
+extern __attribute__((pure)) float __hsail_fma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_nfma_f32(float, float, float);
+extern __attribute__((pure)) float __hsail_min_f32(float, float);
+extern __attribute__((pure)) float __hsail_max_f32(float, float);
+extern __attribute__((pure)) float __hsail_ftz_f32(float);
+extern __attribute__((pure)) float __hsail_round_f32(float);
+extern __attribute__((pure)) float __hsail_floor_f32(float);
+extern __attribute__((pure)) float __hsail_ceil_f32(float);
+extern __attribute__((pure)) float __hsail_trunc_f32(float);
+extern __attribute__((pure)) float __hsail_abs_f32(float);
+
+extern __attribute__((pure)) int  __hsail_min_s32(int, int);
+extern __attribute__((pure)) int  __hsail_max_s32(int, int);
+extern __attribute__((pure)) uint __hsail_min_u32(uint, uint);
+extern __attribute__((pure)) uint __hsail_max_u32(uint, uint);
+extern __attribute__((pure)) int  __hsail_mulhi_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mulhi_u32(uint, uint);
+extern __attribute__((pure)) long  __hsail_mulhi_s64(long, long);
+extern __attribute__((pure)) ulong __hsail_mulhi_u64(ulong, ulong);
+
+// HSAIL intrinsic functions used by math64 functions
+extern __attribute__((pure)) double __hsail_fma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_nfma_f64(double, double, double);
+extern __attribute__((pure)) double __hsail_max_f64(double, double);
+extern __attribute__((pure)) double __hsail_min_f64(double, double);
+extern __attribute__((pure)) double __hsail_round_f64(double);
+extern __attribute__((pure)) double __hsail_floor_f64(double);
+extern __attribute__((pure)) double __hsail_ceil_f64(double);
+extern __attribute__((pure)) double __hsail_trunc_f64(double);
+extern __attribute__((pure)) double __hsail_abs_f64(double);
+extern __attribute__((pure)) double __hsail_nrsqrt_f64(double);
+extern __attribute__((pure)) double __hsail_nsqrt_f64(double);
+
+extern __attribute__((pure)) uint __hsail_mad_u32(uint, uint, uint);
+
+// HSAIL conversion intrinsics
+extern __attribute__((pure)) float __cvt_f32_f16(uint op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f32(float op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f32(float op1);
+
+extern __attribute__((pure)) float __cvt_f16_rtz_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rte_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtn_f64(double op1);
+extern __attribute__((pure)) float __cvt_f16_rtp_f64(double op1);
+
+// Misc HSAIL intrinsic functions
+extern __attribute__((const)) uint __hsail_bitselect_u32(uint, uint, uint);
+extern __attribute__((pure)) int  __hsail_class_f32(float, int);
+extern __attribute__((pure)) int  __hsail_class_f64(double, int);
+extern __attribute__((pure)) int  __hsail_mad24_s32(int, int, int);
+extern __attribute__((pure)) uint __hsail_mad24_u32(uint, uint, uint);
+extern __attribute__((pure)) int  __hsail_mul24_s32(int, int);
+extern __attribute__((pure)) uint __hsail_mul24_u32(uint, uint);
+
+extern __attribute__((pure)) int __hsail_popcount_u32_b32(int);
+
+extern __attribute__((pure)) int __hsail_firstbit_u32(uint);
+
+extern __attribute__((pure)) float  __hsail_fraction_f32(float);
+extern __attribute__((pure)) double __hsail_fraction_f64(double);
+
+// __amdil_ math32 function defs
+
+__attribute__((weak,always_inline)) float
+__amdil_div_f32(float x, float y) {
+  return native_divide(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_fma_f32(float x, float y, float z) {
+  return __hsail_fma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_mad_f32(float x, float y, float z) {
+  return __hsail_nfma_f32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_min_f32(float x, float y) {
+  return __hsail_min_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_max_f32(float x, float y) {
+  return __hsail_max_f32(x, y);
+}
+
+__attribute__((weak,always_inline)) float
+__ftz_f32(float x) {
+  return __hsail_ftz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_nearest_f32(float x) {
+  return __hsail_round_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_neginf_f32(float x) {
+  return __hsail_floor_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_posinf_f32(float x) {
+  return __hsail_ceil_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_round_zero_f32(float x) {
+  return __hsail_trunc_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_fabs_f32(float x) {
+  return __hsail_abs_f32(x);
+}
+
+__attribute__((weak,always_inline)) float
+__amdil_improved_div_f32(float x, float y) {
+  return native_divide(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imin_i32(int x, int y) {
+  return __hsail_min_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imax_i32(int x, int y) {
+  return __hsail_max_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umin_u32(uint x, uint y) {
+  return __hsail_min_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umax_u32(uint x, uint y) {
+  return __hsail_max_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_imul_high_i32(int x, int y) {
+  return __hsail_mulhi_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint
+__amdil_umul_high_u32(uint x, uint y) {
+  return __hsail_mulhi_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umad_u32(uint x, uint y, uint z) {
+  return __hsail_mad_u32(x, y, z);
+}
+
+// __amdil_ math64 function defs
+
+__attribute__((weak,always_inline)) double
+__amdil_fma_f64(double x, double y, double z) {
+  return __hsail_fma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_mad_f64(double x, double y, double z) {
+  return __hsail_nfma_f64(x, y, z);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_max_f64(double x, double y) {
+  return __hsail_max_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_nearest_f64(double x) {
+  return __hsail_round_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_neginf_f64(double x) {
+  return __hsail_floor_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_posinf_f64(double x) {
+  return __hsail_ceil_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_round_zero_f64(double x) {
+  return __hsail_trunc_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_min_f64(double x, double y) {
+  return __hsail_min_f64(x, y);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fabs_f64(double x) {
+  return __hsail_abs_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_sqrt_f64(double x) {
+  return __hsail_nsqrt_f64(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_rsq_f64(double x) {
+  return __hsail_nrsqrt_f64(x);
+}
+
+// __amdil conversion functions
+
+__attribute__((weak,always_inline)) float 
+__amdil_half_to_float_f32(uint x) {
+  return __cvt_f32_f16(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_f32(float x) {
+  return __cvt_f16_rtz_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_near_f32(float x) {
+  return __cvt_f16_rte_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_neg_inf_f32(float x) {
+  return __cvt_f16_rtn_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_float_to_half_plus_inf_f32(float x) {
+  return __cvt_f16_rtp_f32(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_f64(double x) {
+  return __cvt_f16_rtz_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_near_f64(double x) {
+  return __cvt_f16_rte_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_neg_inf_f64(double x) {
+  return __cvt_f16_rtn_f64(x);
+}
+
+__attribute__((weak,always_inline)) float 
+__amdil_double_to_half_plus_inf_f64(double x) {
+  return __cvt_f16_rtp_f64(x);
+}
+
+// Misc __amdil_ function defs
+
+__attribute__((weak,always_inline)) uint
+__amdil_bfi_u32(uint x, uint y, uint z) {
+  return __hsail_bitselect_u32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_class_f32(float x, int y) {
+  int cval = __hsail_class_f32(x, y);
+  int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+  return ret;
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_class_f64(double x, int y) {
+  int cval = __hsail_class_f64(x, y);
+  int ret = (cval & 0x1) ? (0xffffffffU) : 0;
+  return ret;
+}
+
+__attribute__((weak,always_inline)) int 
+__amdil_imad24_i32(int x, int y, int z) {
+  return __hsail_mad24_s32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umad24_u32(uint x, uint y, uint z) {
+  return __hsail_mad24_u32(x, y, z);
+}
+
+__attribute__((weak,always_inline)) int 
+__amdil_imul24_i32(int x, int y) {
+  return __hsail_mul24_s32(x, y);
+}
+
+__attribute__((weak,always_inline)) uint 
+__amdil_umul24_u32(uint x, uint y) {
+  return __hsail_mul24_u32(x, y);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_count_bits_i32(int x) {
+  return __hsail_popcount_u32_b32(x);
+}
+
+__attribute__((weak,always_inline)) int
+__amdil_ffb_hi_u32(uint x) {
+  return __hsail_firstbit_u32(x);
+}
+
+//#ifdef HSAIL_SPEC_CURRENT
+__attribute__((weak,always_inline)) float
+__amdil_fraction_f32(float x) {
+  return __hsail_fraction_f32(x);
+}
+
+__attribute__((weak,always_inline)) double
+__amdil_fraction_f64(double x) {
+  return __hsail_fraction_f64(x);
+}
+//#endif 
+

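The definitions in this hunk are weak translation shims: each legacy __amdil_ entry point simply forwards to the matching __hsail_ (or __cvt_) intrinsic declared at the top of the file. A minimal sketch of a caller going through the shims; the helper function is illustrative and not part of the patch:

    // Illustrative only: clamp(a*b + c) to [0,1] through the legacy entry
    // points; each call forwards to the HSAIL intrinsic named in its wrapper.
    static float legacy_mad_clamp(float a, float b, float c)
    {
        float r = __amdil_mad_f32(a, b, c);    // -> __hsail_nfma_f32
        r = __amdil_max_f32(r, 0.0f);          // -> __hsail_max_f32
        return __amdil_min_f32(r, 1.0f);       // -> __hsail_min_f32
    }
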
Added: libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/atomicWorkItemFence.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+extern void __atomic_memfence(uint flags, uint mo, uint msc);
+enum BrigMemoryFenceSegments {
+  BRIG_MEMORY_FENCE_NONE   = 0,
+  BRIG_MEMORY_FENCE_GROUP  = 1,
+  BRIG_MEMORY_FENCE_GLOBAL = 2,
+  BRIG_MEMORY_FENCE_BOTH   = 3,
+  BRIG_MEMORY_FENCE_IMAGE  = 4
+};
+
+enum BrigMemoryOrder {
+  BRIG_MEMORY_ORDER_NONE = 0,
+  BRIG_MEMORY_ORDER_RELAXED = 1,
+  BRIG_MEMORY_ORDER_ACQUIRE = 2,
+  BRIG_MEMORY_ORDER_RELEASE = 3,
+  BRIG_MEMORY_ORDER_ACQUIRE_RELEASE = 4
+};
+
+enum BrigMemoryScope {
+  BRIG_MEMORY_SCOPE_NONE = 0,
+  BRIG_MEMORY_SCOPE_WAVEFRONT = 1,
+  BRIG_MEMORY_SCOPE_WORKGROUP = 2,
+  BRIG_MEMORY_SCOPE_COMPONENT = 3,
+  BRIG_MEMORY_SCOPE_SYSTEM = 4,
+  BRIG_MEMORY_SCOPE_WORKITEM = 5
+};
+
+static inline uint getBrigMemoryOrder(memory_order mo) {
+  switch(mo) {
+    default : return BRIG_MEMORY_ORDER_NONE;
+    case memory_order_relaxed : return BRIG_MEMORY_ORDER_RELAXED;
+    case memory_order_release : return BRIG_MEMORY_ORDER_RELEASE;
+    case memory_order_acquire : return BRIG_MEMORY_ORDER_ACQUIRE;
+    case memory_order_acq_rel :
+    case memory_order_seq_cst : return BRIG_MEMORY_ORDER_ACQUIRE_RELEASE;
+  }
+}
+
+static inline uint getBrigMemoryScope(memory_scope msc) {
+  switch(msc) {
+    default :  return BRIG_MEMORY_SCOPE_NONE;
+    case memory_scope_work_group : return BRIG_MEMORY_SCOPE_WORKGROUP;
+    case memory_scope_device : return BRIG_MEMORY_SCOPE_COMPONENT;
+    case memory_scope_all_svm_devices : return BRIG_MEMORY_SCOPE_SYSTEM;
+    case memory_scope_sub_group : return BRIG_MEMORY_SCOPE_WAVEFRONT;
+    case memory_scope_work_item : return BRIG_MEMORY_SCOPE_WORKITEM;
+  }
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+atomic_work_item_fence(/*cl_mem_fence_flags*/ unsigned flag, memory_order mo, memory_scope msc) {
+  uint brigSegment = 0;
+  uint brigMemoryOrder = getBrigMemoryOrder(mo);
+  uint brigMemoryScope = BRIG_MEMORY_SCOPE_WORKGROUP;
+  // relaxed fence has no effect
+  if (mo == memory_order_relaxed) return;
+  if ((flag & CLK_GLOBAL_MEM_FENCE) && (flag & CLK_LOCAL_MEM_FENCE)) {
+    brigSegment = BRIG_MEMORY_FENCE_BOTH;
+    brigMemoryScope = getBrigMemoryScope(msc);
+  }
+  else if (flag & CLK_GLOBAL_MEM_FENCE) {
+    brigSegment = BRIG_MEMORY_FENCE_GLOBAL;
+    brigMemoryScope = getBrigMemoryScope(msc);
+  }
+  else if (flag & CLK_LOCAL_MEM_FENCE) {
+    brigSegment = BRIG_MEMORY_FENCE_GROUP;
+  }
+  if (brigSegment != 0) {
+    __atomic_memfence(brigSegment, brigMemoryOrder, brigMemoryScope);
+  }
+  if (flag & CLK_IMAGE_MEM_FENCE) {
+    brigMemoryScope = getBrigMemoryScope(msc);
+    __atomic_memfence(BRIG_MEMORY_FENCE_IMAGE, BRIG_MEMORY_ORDER_ACQUIRE_RELEASE, brigMemoryScope);
+  }
+}
+#endif // __OPENCL_C_VERSION__ >= 200

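A hedged usage sketch of the mapping above (OpenCL 2.0 only; the flag/order/scope combination is illustrative): a local-memory release fence takes the CLK_LOCAL_MEM_FENCE branch, so it lowers to __atomic_memfence(BRIG_MEMORY_FENCE_GROUP, BRIG_MEMORY_ORDER_RELEASE, BRIG_MEMORY_SCOPE_WORKGROUP).

    // Illustrative only: publish data written to __local memory before
    // signalling a flag; the fence resolves as described above.
    void publish_local(volatile __local atomic_int *flag)
    {
        atomic_work_item_fence(CLK_LOCAL_MEM_FENCE,
                               memory_order_release,
                               memory_scope_work_group);
        atomic_store_explicit(flag, 1,
                              memory_order_relaxed,
                              memory_scope_work_group);
    }
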
Added: libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/awgcpy.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,2696 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) int __hsail_workitemid_flat(void);
+
+__attribute__((always_inline)) static event_t
+__AWGClgI1(__local uchar * dst, const __global uchar * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local uchar *, const __global uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI1"))) event_t async_work_group_copy(__local char *, const __global char *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI1(__local uchar *dst, const __global uchar *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local uchar *, const __global uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI1"))) event_t async_work_group_strided_copy(__local char *, const __global char *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI1(__global uchar * dst, const __local uchar * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global uchar *, const __local uchar *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI1"))) event_t async_work_group_copy(__global char *, const __local char *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI1(__global uchar *dst, const __local uchar *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global uchar *, const __local uchar *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI1"))) event_t async_work_group_strided_copy(__global char *, const __local char *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI2(__local ushort * dst, const __global ushort * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local ushort *, const __global ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI2"))) event_t async_work_group_copy(__local short *, const __global short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI2(__local ushort *dst, const __global ushort *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local ushort *, const __global ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI2"))) event_t async_work_group_strided_copy(__local short *, const __global short *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI2(__global ushort * dst, const __local ushort * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global ushort *, const __local ushort *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI2"))) event_t async_work_group_copy(__global short *, const __local short *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI2(__global ushort *dst, const __local ushort *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global ushort *, const __local ushort *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI2"))) event_t async_work_group_strided_copy(__global short *, const __local short *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI4(__local uint * dst, const __global uint * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local uint *, const __global uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI4"))) event_t async_work_group_copy(__local int *, const __global int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI4(__local uint *dst, const __global uint *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local uint *, const __global uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI4"))) event_t async_work_group_strided_copy(__local int *, const __global int *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI4(__global uint * dst, const __local uint * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global uint *, const __local uint *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI4"))) event_t async_work_group_copy(__global int *, const __local int *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI4(__global uint *dst, const __local uint *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global uint *, const __local uint *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI4"))) event_t async_work_group_strided_copy(__global int *, const __local int *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClgI8(__local ulong * dst, const __global ulong * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local ulong *, const __global ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClgI8"))) event_t async_work_group_copy(__local long *, const __global long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClgI8(__local ulong *dst, const __global ulong *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local ulong *, const __global ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClgI8"))) event_t async_work_group_strided_copy(__local long *, const __global long *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCglI8(__global ulong * dst, const __local ulong * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global ulong *, const __local ulong *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCglI8"))) event_t async_work_group_copy(__global long *, const __local long *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCglI8(__global ulong *dst, const __local ulong *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global ulong *, const __local ulong *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCglI8"))) event_t async_work_group_strided_copy(__global long *, const __local long *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float * dst, const __global float * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float *dst, const __global float *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float * dst, const __local float * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float *dst, const __local float *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double * dst, const __global double * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double *dst, const __global double *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double * dst, const __local double * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double *dst, const __local double *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I1(__local uchar2 * dst, const __global uchar2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local uchar2 *, const __global uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I1"))) event_t async_work_group_copy(__local char2 *, const __global char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I1(__local uchar2 *dst, const __global uchar2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local uchar2 *, const __global uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I1"))) event_t async_work_group_strided_copy(__local char2 *, const __global char2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I1(__global uchar2 * dst, const __local uchar2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global uchar2 *, const __local uchar2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I1"))) event_t async_work_group_copy(__global char2 *, const __local char2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I1(__global uchar2 *dst, const __local uchar2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global uchar2 *, const __local uchar2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I1"))) event_t async_work_group_strided_copy(__global char2 *, const __local char2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I2(__local ushort2 * dst, const __global ushort2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local ushort2 *, const __global ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I2"))) event_t async_work_group_copy(__local short2 *, const __global short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I2(__local ushort2 *dst, const __global ushort2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local ushort2 *, const __global ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I2"))) event_t async_work_group_strided_copy(__local short2 *, const __global short2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I2(__global ushort2 * dst, const __local ushort2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global ushort2 *, const __local ushort2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I2"))) event_t async_work_group_copy(__global short2 *, const __local short2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I2(__global ushort2 *dst, const __local ushort2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global ushort2 *, const __local ushort2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I2"))) event_t async_work_group_strided_copy(__global short2 *, const __local short2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I4(__local uint2 * dst, const __global uint2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local uint2 *, const __global uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I4"))) event_t async_work_group_copy(__local int2 *, const __global int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I4(__local uint2 *dst, const __global uint2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local uint2 *, const __global uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I4"))) event_t async_work_group_strided_copy(__local int2 *, const __global int2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I4(__global uint2 * dst, const __local uint2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global uint2 *, const __local uint2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I4"))) event_t async_work_group_copy(__global int2 *, const __local int2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I4(__global uint2 *dst, const __local uint2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global uint2 *, const __local uint2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I4"))) event_t async_work_group_strided_copy(__global int2 *, const __local int2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg2I8(__local ulong2 * dst, const __global ulong2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local ulong2 *, const __global ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg2I8"))) event_t async_work_group_copy(__local long2 *, const __global long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg2I8(__local ulong2 *dst, const __global ulong2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local ulong2 *, const __global ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg2I8"))) event_t async_work_group_strided_copy(__local long2 *, const __global long2 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl2I8(__global ulong2 * dst, const __local ulong2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global ulong2 *, const __local ulong2 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl2I8"))) event_t async_work_group_copy(__global long2 *, const __local long2 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl2I8(__global ulong2 *dst, const __local ulong2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global ulong2 *, const __local ulong2 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl2I8"))) event_t async_work_group_strided_copy(__global long2 *, const __local long2 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long2 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float2 * dst, const __global float2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float2 *dst, const __global float2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float2 * dst, const __local float2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float2 *dst, const __local float2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float2 *p, size_t n)
+{
+    // nothing to do
+}
+
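+// Illustrative sketch only (not part of this file; kernel and argument names are
+// invented): a kernel would typically pair these built-ins with wait_group_events.
+//
+//   __kernel void scale(__global const float2 *in, __global float2 *out,
+//                       __local float2 *tile, uint n)
+//   {
+//       event_t ev = async_work_group_copy(tile, in, n, 0);
+//       wait_group_events(1, &ev);
+//       for (size_t i = get_local_id(0); i < n; i += get_local_size(0))
+//           out[i] = 2.0f * tile[i];
+//   }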
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double2 * dst, const __global double2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double2 *dst, const __global double2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double2 * dst, const __local double2 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double2 *dst, const __local double2 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double2 *p, size_t n)
+{
+    // nothing to do
+}
+
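+// The 3-component helpers below use the same round-robin loop as the shapes above:
+// each work-item starts at its flat id and strides by the flattened work-group size,
+// then the whole group synchronizes at the barrier.  Per the OpenCL spec, 3-component
+// vectors are sized and aligned like 4-component ones, so each element copied here
+// occupies a 4-component-sized slot.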
+__attribute__((always_inline)) static event_t
+__AWGClg3I1(__local uchar3 * dst, const __global uchar3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local uchar3 *, const __global uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I1"))) event_t async_work_group_copy(__local char3 *, const __global char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I1(__local uchar3 *dst, const __global uchar3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local uchar3 *, const __global uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I1"))) event_t async_work_group_strided_copy(__local char3 *, const __global char3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I1(__global uchar3 * dst, const __local uchar3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global uchar3 *, const __local uchar3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I1"))) event_t async_work_group_copy(__global char3 *, const __local char3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I1(__global uchar3 *dst, const __local uchar3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global uchar3 *, const __local uchar3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I1"))) event_t async_work_group_strided_copy(__global char3 *, const __local char3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I2(__local ushort3 * dst, const __global ushort3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local ushort3 *, const __global ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I2"))) event_t async_work_group_copy(__local short3 *, const __global short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I2(__local ushort3 *dst, const __global ushort3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local ushort3 *, const __global ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I2"))) event_t async_work_group_strided_copy(__local short3 *, const __global short3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I2(__global ushort3 * dst, const __local ushort3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global ushort3 *, const __local ushort3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I2"))) event_t async_work_group_copy(__global short3 *, const __local short3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I2(__global ushort3 *dst, const __local ushort3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global ushort3 *, const __local ushort3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I2"))) event_t async_work_group_strided_copy(__global short3 *, const __local short3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I4(__local uint3 * dst, const __global uint3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local uint3 *, const __global uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I4"))) event_t async_work_group_copy(__local int3 *, const __global int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I4(__local uint3 *dst, const __global uint3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local uint3 *, const __global uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I4"))) event_t async_work_group_strided_copy(__local int3 *, const __global int3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I4(__global uint3 * dst, const __local uint3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global uint3 *, const __local uint3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I4"))) event_t async_work_group_copy(__global int3 *, const __local int3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I4(__global uint3 *dst, const __local uint3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global uint3 *, const __local uint3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I4"))) event_t async_work_group_strided_copy(__global int3 *, const __local int3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg3I8(__local ulong3 * dst, const __global ulong3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local ulong3 *, const __global ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg3I8"))) event_t async_work_group_copy(__local long3 *, const __global long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg3I8(__local ulong3 *dst, const __global ulong3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local ulong3 *, const __global ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg3I8"))) event_t async_work_group_strided_copy(__local long3 *, const __global long3 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl3I8(__global ulong3 * dst, const __local ulong3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global ulong3 *, const __local ulong3 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl3I8"))) event_t async_work_group_copy(__global long3 *, const __local long3 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl3I8(__global ulong3 *dst, const __local ulong3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global ulong3 *, const __local ulong3 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl3I8"))) event_t async_work_group_strided_copy(__global long3 *, const __local long3 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float3 * dst, const __global float3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float3 *dst, const __global float3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float3 * dst, const __local float3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float3 *dst, const __local float3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double3 * dst, const __global double3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double3 *dst, const __global double3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double3 * dst, const __local double3 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double3 *dst, const __local double3 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double3 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I1(__local uchar4 * dst, const __global uchar4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local uchar4 *, const __global uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I1"))) event_t async_work_group_copy(__local char4 *, const __global char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I1(__local uchar4 *dst, const __global uchar4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local uchar4 *, const __global uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I1"))) event_t async_work_group_strided_copy(__local char4 *, const __global char4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I1(__global uchar4 * dst, const __local uchar4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global uchar4 *, const __local uchar4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I1"))) event_t async_work_group_copy(__global char4 *, const __local char4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I1(__global uchar4 *dst, const __local uchar4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global uchar4 *, const __local uchar4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I1"))) event_t async_work_group_strided_copy(__global char4 *, const __local char4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I2(__local ushort4 * dst, const __global ushort4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local ushort4 *, const __global ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I2"))) event_t async_work_group_copy(__local short4 *, const __global short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I2(__local ushort4 *dst, const __global ushort4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local ushort4 *, const __global ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I2"))) event_t async_work_group_strided_copy(__local short4 *, const __global short4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I2(__global ushort4 * dst, const __local ushort4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global ushort4 *, const __local ushort4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I2"))) event_t async_work_group_copy(__global short4 *, const __local short4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I2(__global ushort4 *dst, const __local ushort4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global ushort4 *, const __local ushort4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I2"))) event_t async_work_group_strided_copy(__global short4 *, const __local short4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I4(__local uint4 * dst, const __global uint4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local uint4 *, const __global uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I4"))) event_t async_work_group_copy(__local int4 *, const __global int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I4(__local uint4 *dst, const __global uint4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local uint4 *, const __global uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I4"))) event_t async_work_group_strided_copy(__local int4 *, const __global int4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I4(__global uint4 * dst, const __local uint4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global uint4 *, const __local uint4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I4"))) event_t async_work_group_copy(__global int4 *, const __local int4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I4(__global uint4 *dst, const __local uint4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global uint4 *, const __local uint4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I4"))) event_t async_work_group_strided_copy(__global int4 *, const __local int4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg4I8(__local ulong4 * dst, const __global ulong4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local ulong4 *, const __global ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg4I8"))) event_t async_work_group_copy(__local long4 *, const __global long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg4I8(__local ulong4 *dst, const __global ulong4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local ulong4 *, const __global ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg4I8"))) event_t async_work_group_strided_copy(__local long4 *, const __global long4 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl4I8(__global ulong4 * dst, const __local ulong4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global ulong4 *, const __local ulong4 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl4I8"))) event_t async_work_group_copy(__global long4 *, const __local long4 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl4I8(__global ulong4 *dst, const __local ulong4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global ulong4 *, const __local ulong4 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl4I8"))) event_t async_work_group_strided_copy(__global long4 *, const __local long4 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float4 * dst, const __global float4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float4 *dst, const __global float4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float4 * dst, const __local float4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float4 *dst, const __local float4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double4 * dst, const __global double4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double4 *dst, const __global double4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double4 * dst, const __local double4 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double4 *dst, const __local double4 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double4 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I1(__local uchar8 * dst, const __global uchar8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local uchar8 *, const __global uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I1"))) event_t async_work_group_copy(__local char8 *, const __global char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I1(__local uchar8 *dst, const __global uchar8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local uchar8 *, const __global uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I1"))) event_t async_work_group_strided_copy(__local char8 *, const __global char8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I1(__global uchar8 * dst, const __local uchar8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global uchar8 *, const __local uchar8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I1"))) event_t async_work_group_copy(__global char8 *, const __local char8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I1(__global uchar8 *dst, const __local uchar8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global uchar8 *, const __local uchar8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I1"))) event_t async_work_group_strided_copy(__global char8 *, const __local char8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I2(__local ushort8 * dst, const __global ushort8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local ushort8 *, const __global ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I2"))) event_t async_work_group_copy(__local short8 *, const __global short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I2(__local ushort8 *dst, const __global ushort8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local ushort8 *, const __global ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I2"))) event_t async_work_group_strided_copy(__local short8 *, const __global short8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I2(__global ushort8 * dst, const __local ushort8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global ushort8 *, const __local ushort8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I2"))) event_t async_work_group_copy(__global short8 *, const __local short8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I2(__global ushort8 *dst, const __local ushort8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global ushort8 *, const __local ushort8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I2"))) event_t async_work_group_strided_copy(__global short8 *, const __local short8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I4(__local uint8 * dst, const __global uint8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local uint8 *, const __global uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I4"))) event_t async_work_group_copy(__local int8 *, const __global int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I4(__local uint8 *dst, const __global uint8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local uint8 *, const __global uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I4"))) event_t async_work_group_strided_copy(__local int8 *, const __global int8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I4(__global uint8 * dst, const __local uint8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global uint8 *, const __local uint8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I4"))) event_t async_work_group_copy(__global int8 *, const __local int8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I4(__global uint8 *dst, const __local uint8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global uint8 *, const __local uint8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I4"))) event_t async_work_group_strided_copy(__global int8 *, const __local int8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg8I8(__local ulong8 * dst, const __global ulong8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local ulong8 *, const __global ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg8I8"))) event_t async_work_group_copy(__local long8 *, const __global long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg8I8(__local ulong8 *dst, const __global ulong8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local ulong8 *, const __global ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg8I8"))) event_t async_work_group_strided_copy(__local long8 *, const __global long8 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl8I8(__global ulong8 * dst, const __local ulong8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global ulong8 *, const __local ulong8 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl8I8"))) event_t async_work_group_copy(__global long8 *, const __local long8 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl8I8(__global ulong8 *dst, const __local ulong8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global ulong8 *, const __local ulong8 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl8I8"))) event_t async_work_group_strided_copy(__global long8 *, const __local long8 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float8 * dst, const __global float8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float8 *dst, const __global float8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float8 * dst, const __local float8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float8 *dst, const __local float8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float8 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double8 * dst, const __global double8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double8 *dst, const __global double8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double8 * dst, const __local double8 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double8 *dst, const __local double8 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double8 *p, size_t n)
+{
+    // nothing to do
+}
+
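+// The 16-element shapes complete the set.  As with all of the helpers above,
+// correctness relies on every work-item reaching the barrier, so these built-ins must
+// be called from uniform control flow with the same arguments in every work-item.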
+__attribute__((always_inline)) static event_t
+__AWGClg16I1(__local uchar16 * dst, const __global uchar16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local uchar16 *, const __global uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I1"))) event_t async_work_group_copy(__local char16 *, const __global char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I1(__local uchar16 *dst, const __global uchar16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local uchar16 *, const __global uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I1"))) event_t async_work_group_strided_copy(__local char16 *, const __global char16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I1(__global uchar16 * dst, const __local uchar16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global uchar16 *, const __local uchar16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I1"))) event_t async_work_group_copy(__global char16 *, const __local char16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I1(__global uchar16 *dst, const __local uchar16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global uchar16 *, const __local uchar16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I1"))) event_t async_work_group_strided_copy(__global char16 *, const __local char16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uchar16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global char16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I2(__local ushort16 * dst, const __global ushort16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local ushort16 *, const __global ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I2"))) event_t async_work_group_copy(__local short16 *, const __global short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I2(__local ushort16 *dst, const __global ushort16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local ushort16 *, const __global ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I2"))) event_t async_work_group_strided_copy(__local short16 *, const __global short16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I2(__global ushort16 * dst, const __local ushort16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global ushort16 *, const __local ushort16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I2"))) event_t async_work_group_copy(__global short16 *, const __local short16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I2(__global ushort16 *dst, const __local ushort16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global ushort16 *, const __local ushort16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I2"))) event_t async_work_group_strided_copy(__global short16 *, const __local short16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ushort16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global short16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I4(__local uint16 * dst, const __global uint16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local uint16 *, const __global uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I4"))) event_t async_work_group_copy(__local int16 *, const __global int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I4(__local uint16 *dst, const __global uint16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local uint16 *, const __global uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I4"))) event_t async_work_group_strided_copy(__local int16 *, const __global int16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I4(__global uint16 * dst, const __local uint16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global uint16 *, const __local uint16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I4"))) event_t async_work_group_copy(__global int16 *, const __local int16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I4(__global uint16 *dst, const __local uint16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global uint16 *, const __local uint16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I4"))) event_t async_work_group_strided_copy(__global int16 *, const __local int16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global uint16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global int16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((always_inline)) static event_t
+__AWGClg16I8(__local ulong16 * dst, const __global ulong16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local ulong16 *, const __global ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGClg16I8"))) event_t async_work_group_copy(__local long16 *, const __global long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSClg16I8(__local ulong16 *dst, const __global ulong16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local ulong16 *, const __global ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSClg16I8"))) event_t async_work_group_strided_copy(__local long16 *, const __global long16 *, size_t, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGCgl16I8(__global ulong16 * dst, const __local ulong16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global ulong16 *, const __local ulong16 *, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGCgl16I8"))) event_t async_work_group_copy(__global long16 *, const __local long16 *, size_t, event_t);
+
+__attribute__((always_inline)) static event_t
+__AWGSCgl16I8(__global ulong16 *dst, const __local ulong16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global ulong16 *, const __local ulong16 *, size_t, size_t, event_t);
+extern __attribute__((overloadable, weak, alias("__AWGSCgl16I8"))) event_t async_work_group_strided_copy(__global long16 *, const __local long16 *, size_t, size_t, event_t);
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global ulong16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global long16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local float16 * dst, const __global float16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local float16 *dst, const __global float16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global float16 * dst, const __local float16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global float16 *dst, const __local float16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global float16 *p, size_t n)
+{
+    // nothing to do
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__local double16 * dst, const __global double16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__local double16 *dst, const __global double16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i*j];
+        i += d;
+    }
+    barrier(CLK_LOCAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_copy(__global double16 * dst, const __local double16 * src, size_t n, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) event_t
+async_work_group_strided_copy(__global double16 *dst, const __local double16 *src, size_t n, size_t j, event_t e)
+{
+    int4 ls = (int4)(get_local_size(0), get_local_size(1), get_local_size(2), 0);
+    size_t i = __hsail_workitemid_flat();
+    size_t d = ls.x * ls.y * ls.z;
+    while (i < n) {
+        dst[i*j] = src[i];
+        i += d;
+    }
+    barrier(CLK_GLOBAL_MEM_FENCE);
+    return e;
+}
+
+__attribute__((overloadable, always_inline, weak)) void
+prefetch(const __global double16 *p, size_t n)
+{
+    // nothing to do
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) void
+wait_group_events(int num_events, event_t *event_list)
+{
+    // Nothing to do
+}

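Each of these copies strides the whole work-group over the n elements (flat work-item id as the starting index, work-group size as the step) and ends with a barrier, so every work-item must call it with identical arguments. A minimal usage sketch, not part of this patch (the 64-element tile size is illustrative):

    __kernel void scale_blocks(__global const float16 *in, __global float16 *out)
    {
        __local float16 tile[64];
        size_t base = get_group_id(0) * 64;

        // Whole work-group cooperatively stages a block into local memory.
        event_t e = async_work_group_copy(tile, in + base, 64, 0);
        wait_group_events(1, &e);

        size_t lid = get_local_id(0);
        if (lid < 64)
            tile[lid] *= 2.0f;
        barrier(CLK_LOCAL_MEM_FENCE);

        // Copy the block back out; again every work-item participates.
        e = async_work_group_copy(out + base, tile, 64, 0);
        wait_group_events(1, &e);
    }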
Added: libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/bitsel.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((pure)) uint __amdil_bfi_u32(uint, uint, uint);
+
+// [u]int
+
+__attribute__((always_inline)) static uint
+__BSELI4(uint a, uint b, uint c)
+{
+    return __amdil_bfi_u32(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI4"))) uint bitselect(uint, uint, uint);
+extern __attribute__((overloadable, alias("__BSELI4"))) int bitselect(int, int, int);
+
+// float
+
+__attribute__((overloadable, always_inline)) float
+bitselect(float a, float b, float c)
+{
+    return as_float(__amdil_bfi_u32(as_uint(c), as_uint(b), as_uint(a)));
+}
+
+// [u]long
+
+// No __amdil equivalent, so use __hsail intrinsic here
+extern __attribute__((const)) ulong __hsail_bitselect_u64(ulong, ulong, ulong);
+
+__attribute__((always_inline)) static ulong
+__BSELI8(ulong a, ulong b, ulong c)
+{
+    return __hsail_bitselect_u64(c, b, a);
+}
+
+extern __attribute__((overloadable, alias("__BSELI8"))) ulong bitselect(ulong, ulong, ulong);
+extern __attribute__((overloadable, alias("__BSELI8"))) long bitselect(long, long, long);
+
+// double
+
+__attribute__((overloadable, always_inline)) double
+bitselect(double a, double b, double c)
+{
+    return as_double(__hsail_bitselect_u64(as_ulong(c), as_ulong(b), as_ulong(a)));
+}
+

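bitselect(a, b, c) takes each result bit from b where the corresponding bit of c is set and from a where it is clear, which is why the arguments are passed to the bfi/bitselect intrinsics in (c, b, a) order. A scalar reference sketch of the semantics (not part of this patch):

    // Reference semantics for 32-bit operands: bit i of the result is
    // b's bit i if c's bit i is 1, otherwise a's bit i.
    uint bitselect_ref(uint a, uint b, uint c)
    {
        return (a & ~c) | (b & c);
    }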
Added: libclc/branches/amd-builtins/amd-builtins/misc/class.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/class.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/class.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/class.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#define SNAN 0x001
+#define QNAN 0x002
+#define NINF 0x004
+#define NNOR 0x008
+#define NSUB 0x010
+#define NZER 0x020
+#define PZER 0x040
+#define PSUB 0x080
+#define PNOR 0x100
+#define PINF 0x200
+
+extern __attribute__((pure)) int __amdil_class_f32(float, int);
+extern __attribute__((pure)) int __amdil_class_f64(double, int);
+
+#define FC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(float x) \
+{ \
+    return __amdil_class_f32(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) int2 \
+F(float2 x) \
+{ \
+    int2 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int3 \
+F(float3 x) \
+{ \
+    int3 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int4 \
+F(float4 x) \
+{ \
+    int4 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int8 \
+F(float8 x) \
+{ \
+    int8 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    ret.s4 = __amdil_class_f32(x.s4, M); \
+    ret.s5 = __amdil_class_f32(x.s5, M); \
+    ret.s6 = __amdil_class_f32(x.s6, M); \
+    ret.s7 = __amdil_class_f32(x.s7, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) int16 \
+F(float16 x) \
+{ \
+    int16 ret; \
+    ret.s0 = __amdil_class_f32(x.s0, M); \
+    ret.s1 = __amdil_class_f32(x.s1, M); \
+    ret.s2 = __amdil_class_f32(x.s2, M); \
+    ret.s3 = __amdil_class_f32(x.s3, M); \
+    ret.s4 = __amdil_class_f32(x.s4, M); \
+    ret.s5 = __amdil_class_f32(x.s5, M); \
+    ret.s6 = __amdil_class_f32(x.s6, M); \
+    ret.s7 = __amdil_class_f32(x.s7, M); \
+    ret.s8 = __amdil_class_f32(x.s8, M); \
+    ret.s9 = __amdil_class_f32(x.s9, M); \
+    ret.sa = __amdil_class_f32(x.sa, M); \
+    ret.sb = __amdil_class_f32(x.sb, M); \
+    ret.sc = __amdil_class_f32(x.sc, M); \
+    ret.sd = __amdil_class_f32(x.sd, M); \
+    ret.se = __amdil_class_f32(x.se, M); \
+    ret.sf = __amdil_class_f32(x.sf, M); \
+    return ret; \
+}
+
+
+#define DC(F,M) \
+__attribute__((overloadable, always_inline)) int \
+F(double x) \
+{ \
+    return __amdil_class_f64(x, M) & 1; \
+} \
+__attribute__((overloadable, always_inline)) long2 \
+F(double2 x) \
+{ \
+    long2 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long3 \
+F(double3 x) \
+{ \
+    long3 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long4 \
+F(double4 x) \
+{ \
+    long4 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long8 \
+F(double8 x) \
+{ \
+    long8 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    ret.s4 = __amdil_class_f64(x.s4, M); \
+    ret.s5 = __amdil_class_f64(x.s5, M); \
+    ret.s6 = __amdil_class_f64(x.s6, M); \
+    ret.s7 = __amdil_class_f64(x.s7, M); \
+    return ret; \
+} \
+__attribute__((overloadable, always_inline)) long16 \
+F(double16 x) \
+{ \
+    long16 ret; \
+    ret.s0 = __amdil_class_f64(x.s0, M); \
+    ret.s1 = __amdil_class_f64(x.s1, M); \
+    ret.s2 = __amdil_class_f64(x.s2, M); \
+    ret.s3 = __amdil_class_f64(x.s3, M); \
+    ret.s4 = __amdil_class_f64(x.s4, M); \
+    ret.s5 = __amdil_class_f64(x.s5, M); \
+    ret.s6 = __amdil_class_f64(x.s6, M); \
+    ret.s7 = __amdil_class_f64(x.s7, M); \
+    ret.s8 = __amdil_class_f64(x.s8, M); \
+    ret.s9 = __amdil_class_f64(x.s9, M); \
+    ret.sa = __amdil_class_f64(x.sa, M); \
+    ret.sb = __amdil_class_f64(x.sb, M); \
+    ret.sc = __amdil_class_f64(x.sc, M); \
+    ret.sd = __amdil_class_f64(x.sd, M); \
+    ret.se = __amdil_class_f64(x.se, M); \
+    ret.sf = __amdil_class_f64(x.sf, M); \
+    return ret; \
+}
+
+FC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+FC(isinf, (NINF|PINF))
+FC(isnan, (SNAN|QNAN))
+FC(isnormal, (NNOR|PNOR))
+
+DC(isfinite, (NNOR|NSUB|NZER|PZER|PSUB|PNOR))
+DC(isinf, (NINF|PINF))
+DC(isnan, (SNAN|QNAN))
+DC(isnormal, (NNOR|PNOR))
+

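Each relational passes a set of IEEE-754 class bits to the intrinsic, which reports whether the input falls into any of the requested classes; for example isnan uses SNAN|QNAN. A plain OpenCL C sketch of that classification, using the SNAN..PINF masks defined at the top of the file (illustrative only; not how __amdil_class_f32 is actually implemented):

    // Returns nonzero if x belongs to one of the classes selected by mask.
    static int class_ref(float x, int mask)
    {
        uint u = as_uint(x);
        uint e = (u >> 23) & 0xffu;        // biased exponent
        uint m = u & 0x007fffffu;          // mantissa
        int neg = (u >> 31) != 0;
        int cls;

        if (e == 0xffu)
            cls = m ? ((m & 0x00400000u) ? QNAN : SNAN) : (neg ? NINF : PINF);
        else if (e == 0u)
            cls = m ? (neg ? NSUB : PSUB) : (neg ? NZER : PZER);
        else
            cls = neg ? NNOR : PNOR;

        return (cls & mask) != 0;
    }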
Added: libclc/branches/amd-builtins/amd-builtins/misc/counter.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/counter.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/counter.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/counter.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,44 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#ifdef USE_COUNTER
+
+#pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable
+
+extern uint __amdil_append_alloc_i32(counter32_t);
+extern uint __amdil_append_consume_i32(counter32_t);
+
+__attribute__((overloadable, always_inline)) uint
+atomic_inc(counter32_t p)
+{
+    return __amdil_append_alloc_i32(p);
+}
+
+__attribute__((overloadable, always_inline)) uint
+atomic_dec(counter32_t p)
+{
+    // append_consume returns the post-decrement (updated) value, but
+    // atomic_dec must return the value held before the decrement, so add 1.
+    return __amdil_append_consume_i32(p) + 1U;
+}
+
+#endif
+

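This path is only compiled when USE_COUNTER is defined and the cl_amd_atomic_counters32 extension is available; the counter then behaves like a device-wide append cursor. A hypothetical kernel using it (illustrative, not part of the patch):

    #pragma OPENCL EXTENSION cl_amd_atomic_counters32 : enable

    // Append the global ids of non-zero inputs to out[]; each caller gets
    // a unique slot from the counter.
    __kernel void compact_nonzero(__global const int *in, __global int *out, counter32_t cnt)
    {
        size_t gid = get_global_id(0);
        if (in[gid] != 0) {
            uint slot = atomic_inc(cnt);   // returns the value before the increment
            out[slot] = (int)gid;
        }
    }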
Added: libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/floattointconversion.h Tue Oct  7 12:10:46 2014
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+
+static inline double float_uint_to_double(uint x)
+{
+    double d;
+    float f = as_float(x);
+
+    // Fix up subnormal, if necessary
+    uint fmant = x & 0x007fffff;
+    float temp = as_float(fmant | 0x3f800000);
+    temp -= 1.0;
+    d = (float)temp;
+    ulong ld = as_ulong(d);
+    ld -= 0x07e0000000000000;
+    d = as_double(ld);
+    d = fmant ? d : 0.0;
+    d = x & 0x80000000 ? -d : d;
+    d = (f != 0.0) ? (double)f : d;
+
+    return d;
+}
+
+static inline uint double_to_float_uint(double d)
+{
+    uint dlow, dhigh, dsign;
+    float f = (float)d;
+    uint uf;
+
+    double dabs = (d < 0.) ? -d : d;
+
+    // Fix up subnormal
+    ulong ld = as_ulong(d);
+    dlow = ld;
+    dhigh = ld >> 32;
+    dsign = dhigh & 0x80000000;
+
+    int dexp = (dhigh >> 20) & 0x7ff;
+    int shiftcount = 0x381 - dexp;
+    dhigh &= 0x000fffff;
+    dhigh |= 0x00100000;
+    dhigh = (dhigh << 3) | (dlow >> 29);
+    dlow <<= 3;
+    uint extrabits = dlow << (32 - shiftcount);
+    dlow = (dlow >> shiftcount) | (dhigh << (32 - shiftcount));
+    dhigh >>= shiftcount;
+    dhigh = ((dlow > 0x80000000u) ||
+             ((dlow == 0x80000000u) && ((dhigh & 1) | extrabits))) ? dhigh + 1 : dhigh;
+    uf = dhigh | dsign;
+    uf = dabs >= 7.0064923216240869000000e-046 ? uf : 0;
+
+    uf = f != 0. ? as_uint(f) : uf;
+    return uf;
+}
\ No newline at end of file

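These two helpers move a float bit pattern through a double and back, handling subnormals explicitly; for normal, nonzero values the round trip reproduces the original bits exactly. A small check sketch (illustrative, not part of the patch; assumes floattointconversion.h is included):

    // Round-trip check for a normal, nonzero float.
    static int float_bits_roundtrip_ok(float f)
    {
        uint bits = as_uint(f);
        double d = float_uint_to_double(bits);
        return double_to_float_uint(d) == bits;
    }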
Added: libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/minmax.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//#define G(F,T,N) \
+//__attribute__((overloadable, always_inline)) T##N \
+//F(T##N x, T##N y) \
+//{ \
+//    T##N ret; \
+//    ret.lo = F(x.lo, y.lo); \
+//    ret.hi = F(x.hi, y.hi); \
+//    return ret; \
+//}
+//
+//G(min,float,16)
+//G(min,float,8)
+
+//__attribute__((overloadable, always_inline)) float4
+//min(float4 x, float4 y)
+//{
+//    return __amdil_min_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//min(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+//    return __amdil_min_v3f32(x, y);
+//#else
+//    float3 ret;
+//    ret.xy = min(x.xy, y.xy);
+//    ret.z = min(x.z, y.z);
+//    return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//min(float2 x, float2 y)
+//{
+//    return __amdil_min_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_min_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+min(float x, float y)
+{
+    return __hsail_min_f32(x, y);
+}
+
+//G(min,double,16)
+//G(min,double,8)
+//G(min,double,4)
+//G(min,double,3)
+//G(min,double,2)
+
+extern __attribute__((pure)) double __hsail_min_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+min(double x, double y)
+{
+    return __hsail_min_f64(x, y);
+}
+
+//G(max,float,16)
+//G(max,float,8)
+//
+//__attribute__((overloadable, always_inline)) float4
+//max(float4 x, float4 y)
+//{
+//    return __amdil_max_v4f32(x, y);
+//}
+//
+//__attribute__((overloadable, always_inline)) float3
+//max(float3 x, float3 y)
+//{
+//#if defined VEC3_BACKEND
+//    return __amdil_max_v3f32(x, y);
+//#else
+//    float3 ret;
+//    ret.xy = max(x.xy, y.xy);
+//    ret.z = max(x.z, y.z);
+//    return ret;
+//#endif
+//}
+//
+//__attribute__((overloadable, always_inline)) float2
+//max(float2 x, float2 y)
+//{
+//    return __amdil_max_v2f32(x, y);
+//}
+
+extern __attribute__((pure)) float __hsail_max_f32(float,float);
+
+__attribute__((weak, overloadable, always_inline)) float
+max(float x, float y)
+{
+    return __hsail_max_f32(x, y);
+}
+
+//G(max,double,16)
+//G(max,double,8)
+//G(max,double,4)
+//G(max,double,3)
+//G(max,double,2)
+
+extern __attribute__((pure)) double __hsail_max_f64(double,double);
+
+__attribute__((weak, overloadable, always_inline)) double
+max(double x, double y)
+{
+    return __hsail_max_f64(x, y);
+}

Added: libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/printf_alloc.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#if __OPENCL_C_VERSION__ >= 200
+
+#ifndef NULL
+#define NULL 0
+#endif
+
+extern __attribute__((const)) uint  __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+
+#define OFFSET 8
+
+__global char* __printf_alloc(unsigned int bytes)
+{
+  // Functionality:
+  // The printf buffer pointer is loaded from an implicit kernel argument
+  // (byte offset 12 in 32-bit mode, 24 in 64-bit mode). The first 8 bytes
+  // of the buffer are a control header and are skipped:
+  //   - the uint at byte 0 holds the current write offset; it is advanced
+  //     by the requested number of bytes with an atomic compare-exchange.
+  //   - the uint at byte 4 holds the total buffer size.
+  // If advancing the offset would run past the end of the buffer we have
+  // an overflow condition and return NULL.
+  // The buffer size is limited to what fits in a uint.
+  //
+  __global char* ptr;
+  if (sizeof(size_t) == 4)
+    ptr = (__global char*) __hsail_ld_kernarg_u32(12);
+  else
+    ptr = (__global char*) __hsail_ld_kernarg_u64(24);
+  uint size = ((global uint *)ptr)[1];
+  uint offset = atomic_load_explicit((__global atomic_uint *)ptr,
+                                     memory_order_acquire, memory_scope_device);
+  for (;;) {
+    if (OFFSET + offset + bytes > size)
+      return NULL;
+    if (atomic_compare_exchange_strong_explicit((__global atomic_uint *)ptr,
+        &offset, offset+bytes, memory_order_acq_rel, memory_order_acquire,
+        memory_scope_device))
+      break;
+  }
+  return ptr + OFFSET + offset;
+}
+#endif

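The returned pointer already points past the 8-byte header, so a caller just writes its packet at that address. A hypothetical sketch of how a printf lowering might use it (not from this patch; the packet layout is purely illustrative):

    extern __global char *__printf_alloc(unsigned int bytes);

    // Hypothetical packet: a 4-byte format-string id followed by one
    // 4-byte argument, 8 bytes total.
    static int emit_printf_1u(uint fmt_id, uint arg0)
    {
        __global char *pkt = __printf_alloc(8);
        if (!pkt)
            return -1;                        // buffer full: drop the printf
        ((__global uint *)pkt)[0] = fmt_id;   // which format string
        ((__global uint *)pkt)[1] = arg0;     // the single argument
        return 0;
    }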
Added: libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/relationals.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// Vector expansions for HSAIL relationals
+
+#define UnaryRelationalVector(oty, ity, fun, mgl) \
+__attribute__((weak,always_inline)) \
+oty##16 __##fun##_16##mgl(ity##16 a) \
+{ \
+    oty##16 c; \
+    c.lo = fun(a.lo); \
+    c.hi = fun(a.hi); \
+    return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##8 __##fun##_8##mgl(ity##8 a) \
+{ \
+    oty##8 c; \
+    c.lo = fun(a.lo); \
+    c.hi = fun(a.hi); \
+    return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##4 __##fun##_4##mgl(ity##4 a) \
+{ \
+    oty##4 c; \
+    c.lo = fun(a.lo); \
+    c.hi = fun(a.hi); \
+    return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##3 __##fun##_3##mgl(ity##3 a) \
+{ \
+    oty##3 c; \
+    c.xy = fun(a.xy); \
+    c.z = fun(a.z); \
+    return c; \
+} \
+__attribute__((weak,always_inline)) \
+oty##2 __##fun##_2##mgl(ity##2 a) \
+{ \
+    oty##2 c; \
+    c.lo = fun(a.lo); \
+    c.hi = fun(a.hi); \
+    return c; \
+}
+
+UnaryRelationalVector(int, float, isfinite, f32)
+UnaryRelationalVector(long, double, isfinite, f64)
+
+UnaryRelationalVector(int, float, isinf, f32)
+UnaryRelationalVector(long, double, isinf, f64)
+
+UnaryRelationalVector(int, float, isnan, f32)
+UnaryRelationalVector(long, double, isnan, f64)
+
+UnaryRelationalVector(int, float, isnormal, f32)
+UnaryRelationalVector(long, double, isnormal, f64)
+

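For one instantiation, UnaryRelationalVector(int, float, isnan, f32) expands (4-wide case) to roughly the following, splitting the vector into halves and delegating them to the isnan overloads defined in class.cl above:

    __attribute__((weak,always_inline))
    int4 __isnan_4f32(float4 a)
    {
        int4 c;
        c.lo = isnan(a.lo);   // int2 result for the low half
        c.hi = isnan(a.hi);   // int2 result for the high half
        return c;
    }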
Added: libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/synchronization.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern void __hsail_memfence();
+extern void __hsail_memfence_global();
+extern void __hsail_memfence_group();
+extern void __hsail_barrier();
+
+void mem_fence_impl(uint val) {
+  if (val == CLK_GLOBAL_MEM_FENCE) {
+    __hsail_memfence_global();
+  } else if (val == CLK_LOCAL_MEM_FENCE) {
+    __hsail_memfence_group();
+  } else {
+    __hsail_memfence();
+  }
+}
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void read_mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+void write_mem_fence(uint val) {
+  mem_fence_impl(val);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline))
+void barrier(uint flags) {
+  __hsail_barrier();
+}

Added: libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/misc/workitem.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,238 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern __attribute__((const)) uint __hsail_get_global_size(uint);
+extern __attribute__((const)) uint __hsail_get_global_id(uint);
+extern __attribute__((const)) uint __hsail_workgroup_size(uint);
+extern __attribute__((const)) uint __hsail_currentworkgroup_size(uint);
+extern __attribute__((const)) uint __hsail_get_local_id(uint);
+extern __attribute__((const)) uint __hsail_get_num_groups(uint);
+extern __attribute__((const)) uint __hsail_get_group_id(uint);
+extern __attribute__((const)) uint   __hsail_get_work_dim(void);
+extern __attribute__((const)) uint  __hsail_ld_kernarg_u32(uint);
+extern __attribute__((const)) ulong __hsail_ld_kernarg_u64(uint);
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
+// FIXME - this will change to ulong soon
+extern __attribute__((pure)) uint __hsail_workitemid_flatabs(void);
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_offset(uint d) {
+  if (sizeof(size_t) == 4) { // 32 bit
+    switch(d) {
+      default:
+        return 0;
+      case 0:
+        return __hsail_ld_kernarg_u32(0);
+      case 1:
+        return __hsail_ld_kernarg_u32(4);
+      case 2:
+        return __hsail_ld_kernarg_u32(8);
+     }
+  } else { // 64 bit
+    switch(d) {
+      default:
+        return 0;
+      case 0:
+        return __hsail_ld_kernarg_u64(0);
+      case 1:
+        return __hsail_ld_kernarg_u64(8);
+      case 2:
+        return __hsail_ld_kernarg_u64(16);
+    }
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_id(uint d) {
+  size_t id;
+  size_t o = get_global_offset(d);
+  switch(d) {
+    default:
+      id = 0;
+      break;
+    case 0:
+      id = __hsail_get_global_id(0);
+      break;
+    case 1:
+      id = __hsail_get_global_id(1);
+      break;
+    case 2:
+      id = __hsail_get_global_id(2);
+      break;
+  }
+
+  return o + id;
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_id(uint d) {
+  switch(d) {
+    default:
+      return 0;
+    case 0:
+      return __hsail_get_local_id(0);
+    case 1:
+      return __hsail_get_local_id(1);
+    case 2:
+      return __hsail_get_local_id(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_group_id(uint d) {
+  switch(d) {
+    default:
+      return 0;
+    case 0:
+      return __hsail_get_group_id(0);
+    case 1:
+      return __hsail_get_group_id(1);
+    case 2:
+      return __hsail_get_group_id(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_get_global_size(0);
+    case 1:
+      return __hsail_get_global_size(1);
+    case 2:
+      return __hsail_get_global_size(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_currentworkgroup_size(0);
+    case 1:
+      return __hsail_currentworkgroup_size(1);
+    case 2:
+      return __hsail_currentworkgroup_size(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_num_groups(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_get_num_groups(0);
+    case 1:
+      return __hsail_get_num_groups(1);
+    case 2:
+      return __hsail_get_num_groups(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+uint get_work_dim() {
+  return __hsail_get_work_dim();
+}
+
+#if __OPENCL_C_VERSION__ >= 200
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_enqueued_local_size(uint d) {
+  switch(d) {
+    default:
+      return 1;
+    case 0:
+      return __hsail_workgroup_size(0);
+    case 1:
+      return __hsail_workgroup_size(1);
+    case 2:
+      return __hsail_workgroup_size(2);
+  }
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_global_linear_id(void) {
+#if defined NO_WORKITEM_FLATABS
+    return (__hsail_get_global_id(2) * __hsail_get_global_size(1) +
+            __hsail_get_global_id(1)) * __hsail_get_global_size(0) +
+	    __hsail_get_global_id(0);
+#else
+    return __hsail_workitemid_flatabs();
+#endif
+}
+
+#ifdef __clang__
+    __attribute__((always_inline, overloadable))
+#else
+__attribute__((always_inline))
+#endif
+size_t get_local_linear_id(void) {
+    return __hsail_workitemid_flat();
+}
+
+#endif
+

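The fallback path of get_global_linear_id flattens the id in row-major order with dimension 0 varying fastest; the HSAIL ids used there exclude the global offset, matching the OpenCL 2.0 definition. An equivalent sketch written with the public work-item API (illustrative, not part of the patch):

    // Row-major flattening of the offset-free global id.
    size_t global_linear_id_ref(void)
    {
        size_t x = get_global_id(0) - get_global_offset(0);
        size_t y = get_global_id(1) - get_global_offset(1);
        size_t z = get_global_id(2) - get_global_offset(2);
        return (z * get_global_size(1) + y) * get_global_size(0) + x;
    }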
Added: libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/commitp.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// Work group functions
+
+#define __WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__work_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__work_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+// sub group functions
+
+#define __SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_read_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_read_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+
+#define __SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) void \
+__sub_group_commit_write_pipe_internal_##SIZE(__global struct pipeimp* p, size_t rid) \
+{ \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_COMMIT_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) void
+__sub_group_commit_write_pipe_internal_user(__global struct pipeimp* p, size_t rid, size_t size)
+{
+}
+

Added: libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/getp.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __GET_PIPE_NUM_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_num_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    return (uint)(wi - ri); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_NUM_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_num_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    return (uint)(wi - ri);
+}
+
+#define __GET_PIPE_MAX_PACKETS_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) uint \
+__get_pipe_max_packets_internal_##SIZE(__global struct pipeimp* p) \
+{ \
+    return (uint)p->end_idx; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__GET_PIPE_MAX_PACKETS_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) uint
+__get_pipe_max_packets_internal_user(__global struct pipeimp* p, size_t size)
+{
+    return (uint)p->end_idx;
+}
+

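The packet count is simply the distance between the monotonically increasing write and read cursors; because the subtraction is unsigned, the result stays correct even after the size_t counters wrap. A sketch of the bookkeeping (illustrative; assumes end_idx in struct pipeimp holds the capacity in packets, as __get_pipe_max_packets_internal suggests):

    // Packets currently in the pipe.
    uint pipe_used_packets(size_t read_idx, size_t write_idx)
    {
        return (uint)(write_idx - read_idx);
    }

    // Remaining capacity.
    uint pipe_free_packets(size_t read_idx, size_t write_idx, size_t end_idx)
    {
        return (uint)end_idx - (uint)(write_idx - read_idx);
    }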
Added: libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/memcpyia.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,70 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+__attribute__((always_inline, weak)) void 
+__memcpy_internal_aligned(void *d, const void *s, size_t size, size_t align)
+{
+    if (align == 2) {
+	short *d2 = (short *)d;
+	short *s2 = (short *)s;
+	short *e2 = s2 + size/2;
+
+	while (s2 < e2)
+	    *d2++ = *s2++;
+    } else if (align == 4) {
+	int *d4 = (int *)d;
+	int *s4 = (int *)s;
+	int *e4 = s4 + size/4;
+
+	while (s4 < e4)
+	    *d4++ = *s4++;
+    } else if (align == 8) {
+	long *d8 = (long *)d;
+	long *s8 = (long *)s;
+	long *e8 = s8 + size/8;
+
+	while (s8 < e8)
+	    *d8++ = *s8++;
+    } else if (align == 16) {
+	long2 *d16 = (long2 *)d;
+	long2 *s16 = (long2 *)s;
+	long2 *e16 = s16 + size/16;
+
+	while (s16 < e16)
+	    *d16++ = *s16++;
+    } else if (align == 32 || align == 64 || align == 128) {
+	long4 *d32 = (long4 *)d;
+	long4 *s32 = (long4 *)s;
+	long4 *e32 = s32 + size/32;
+
+	while (s32 < e32)
+	    *d32++ = *s32++;
+    } else {
+	char *d1 = (char *)d;
+	char *s1 = (char *)s;
+	char *e1 = s1 + size;
+
+	while (s1 < e1)
+	    *d1++ = *s1++;
+    }
+}
+
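
__memcpy_internal_aligned picks the widest element type the stated alignment allows (short, int, long, long2, long4) and falls back to a byte copy otherwise; it is the packet copier used by the user-defined-size pipe paths. A hedged sketch (hypothetical kernel, not part of this commit) of copying one 16-byte packet that is only guaranteed 4-byte aligned, which takes the int branch above:

    __kernel void copy_one_packet(__global int *dst, __global const int *src)
    {
        if (get_global_id(0) == 0)
            __memcpy_internal_aligned(dst, src, 16, 4);   // four 32-bit copies
    }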

Added: libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/pipes.h Tue Oct  7 12:10:46 2014
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#ifndef _PIPES_H
+#define _PIPES_H 1
+
+#pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+#pragma OPENCL EXTENSION cl_khr_int64_extended_atomics : enable
+ 
+#define DO_PIPE_INTERNAL_SIZE(F) \
+F(1,uchar) \
+F(2,ushort) \
+F(4,uint) \
+F(8,ulong) \
+F(16,ulong2) \
+F(32,ulong4) \
+F(64,ulong8) \
+F(128,ulong16)
+
+struct pipeimp {
+    atomic_size_t read_idx;
+    atomic_size_t write_idx;
+    size_t end_idx;
+    uchar pad[128 - 3*sizeof(size_t)];
+    uchar packets[1];
+};
+
+extern void __memcpy_internal_aligned(void *, const void *, size_t, size_t);
+
+static inline size_t
+reserve(volatile __global atomic_size_t *pidx, size_t lim, size_t n)
+{
+    size_t idx = atomic_load_explicit(pidx, memory_order_acquire, memory_scope_device);
+
+    for (;;) {
+	if (idx + n > lim)
+	    return ~(size_t)0;
+
+	if (atomic_compare_exchange_strong_explicit(pidx, &idx, idx + n, memory_order_acq_rel, memory_order_acquire, memory_scope_device))
+	    break;
+    }
+
+    return idx;
+}
+
+#endif // _PIPES_H
+
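
reserve() is the core primitive of the pipe implementation: a compare-and-swap loop that claims n packet slots below the limit lim and returns the starting index, or ~(size_t)0 when the claim would overrun the limit. DO_PIPE_INTERNAL_SIZE then stamps out one *_internal_<N> function per power-of-two packet size from 1 to 128 bytes. An illustrative sketch (not part of this commit) of how a writer would claim a single slot with reserve(), mirroring the logic used later in writep.cl:

    // p is assumed to point at an initialized pipe control block.
    static int try_claim_one_packet(__global struct pipeimp *p)
    {
        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed,
                                         memory_scope_device);
        size_t wi = reserve(&p->write_idx, ri + p->end_idx, 1);
        return wi == ~(size_t)0 ? -1 : (int)(wi % p->end_idx);  // slot index on success
    }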

Added: libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/readp.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_internal_##SIZE(__global struct pipeimp* p, STYPE* ptr) \
+{ \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ri = reserve(&p->read_idx, wi, 1); \
+    if (ri == ~(size_t)0) \
+        return -1; \
+ \
+    *ptr = ((__global STYPE *)p->packets)[ri % p->end_idx]; \
+ \
+    if (ri == wi-1) { \
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    }\
+\
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_internal_user( __global struct pipeimp* p, void* ptr, size_t size, size_t align)
+{
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t ri = reserve(&p->read_idx, wi, 1);
+    if (ri == ~(size_t)0)
+        return -1;
+
+    __memcpy_internal_aligned(ptr, p->packets + (ri % p->end_idx)*size, size, align);
+
+    if (ri == wi-1) {
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return 0;
+}
+
+#define __READ_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__read_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, STYPE* ptr)  \
+{ \
+    rid += i; \
+    *ptr = ((__global STYPE *)p->packets)[rid % p->end_idx]; \
+ \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__READ_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__read_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, void *ptr, size_t size, size_t align)
+{
+    rid += i;
+
+    __memcpy_internal_aligned(ptr, p->packets + (rid % p->end_idx)*size, size, align);
+
+    return 0;
+}
+
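
A successful read returns 0 and copies one packet out of the ring; -1 means the pipe was empty. When the last outstanding packet is consumed (ri == wi-1) both indices are reset to zero so the ring restarts from the beginning. Usage sketch (hypothetical kernel, not part of this commit), assuming read_pipe is lowered to __read_pipe_internal_<size> and the reserved form to __read_pipe_reserved_internal_<size>:

    __kernel void drain(__read_only pipe int p, __global int *dst, __global int *ok)
    {
        int v;
        int r = read_pipe(p, &v);          // backed by __read_pipe_internal_4
        ok[get_global_id(0)] = (r == 0);
        if (r == 0)
            dst[get_global_id(0)] = v;
    }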

Added: libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/reservep.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,235 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+#include "pipes.h"
+#include "../workgroup/wg.h"
+
+#define __RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+    size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+    if (rid + num_packets == wi) { \
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+    } \
+ \
+    return rid; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+    size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+    if (rid + num_packets == wi) {
+        atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+        atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+    }
+
+    return rid;
+}
+
+#define __RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    return reserve(&p->write_idx, ri + ei, num_packets); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    return reserve(&p->write_idx, ri + ei, num_packets);
+}
+
+// Work group functions
+
+#define __WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        size_t rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid + num_packets == wi) { \
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+ \
+        *t = rid; \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    __local size_t *t = (__local size_t *)__wg_scratch;
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        size_t rid = reserve(&p->read_idx, wi, num_packets);
+
+        if (rid + num_packets == wi) {
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+
+        *t = rid;
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+#define __WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__work_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    __local size_t *t = (__local size_t *)__wg_scratch; \
+ \
+    if ((int)get_local_linear_id() == 0) { \
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        *t = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return *t; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WORK_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__work_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    __local size_t *t = (__local size_t *)__wg_scratch;
+
+    if ((int)get_local_linear_id() == 0) {
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        *t = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    work_group_barrier(CLK_LOCAL_MEM_FENCE);
+
+    return *t;
+}
+
+// sub group functions
+
+#define __SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_read_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device); \
+        rid = reserve(&p->read_idx, wi, num_packets); \
+ \
+        if (rid + num_packets == wi) { \
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device); \
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device); \
+        } \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_READ_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_read_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t wi = atomic_load_explicit(&p->write_idx, memory_order_relaxed, memory_scope_device);
+        rid = reserve(&p->read_idx, wi, num_packets);
+
+        if (rid + num_packets == wi) {
+            atomic_store_explicit(&p->write_idx, 0, memory_order_release, memory_scope_device);
+            atomic_store_explicit(&p->read_idx, 0, memory_order_relaxed, memory_scope_device);
+        }
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
+#define __SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) size_t \
+__sub_group_reserve_write_pipe_internal_##SIZE(__global struct pipeimp *p, uint num_packets) \
+{ \
+    size_t rid = ~(size_t)0; \
+ \
+    if (get_sub_group_local_id() == 0) { \
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+        size_t ei = p->end_idx; \
+        rid = reserve(&p->write_idx, ri + ei, num_packets); \
+    } \
+ \
+    return sub_group_broadcast(rid, 0); \
+}
+
+DO_PIPE_INTERNAL_SIZE(__SUB_GROUP_RESERVE_WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) size_t
+__sub_group_reserve_write_pipe_internal_user(__global struct pipeimp *p, uint num_packets, size_t size)
+{
+    size_t rid = ~(size_t)0;
+
+    if (get_sub_group_local_id() == 0) {
+        size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+        size_t ei = p->end_idx;
+        rid = reserve(&p->write_idx, ri + ei, num_packets);
+    }
+
+    return sub_group_broadcast(rid, 0);
+}
+
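
Three reservation flavours are provided: per-work-item, work-group, and sub-group. The work-group variants let work-item 0 perform the reservation, stash the reserved index in __wg_scratch, and publish it with a local-memory barrier; the sub-group variants reserve on lane 0 and publish with sub_group_broadcast. A hedged usage sketch (hypothetical kernel, not part of this commit), assuming the work-group pipe built-ins are lowered to the *_internal_* functions above:

    __kernel void wg_produce(__write_only pipe int p, __global const int *src)
    {
        int v = src[get_global_id(0)];
        reserve_id_t rid = work_group_reserve_write_pipe(p, (uint)get_local_size(0));
        if (is_valid_reserve_id(rid)) {
            write_pipe(p, rid, (uint)get_local_id(0), &v);   // one packet per work-item
            work_group_commit_write_pipe(p, rid);
        }
    }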

Added: libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/validp.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,32 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+
+__attribute__((always_inline, weak)) bool
+__is_valid_reserve_id(size_t rid)
+{
+    return rid != ~(size_t)0;
+}
+

Added: libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/pipes/writep.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+//
+// Copyright (c) 2014 Advanced Micro Devices, Inc. All rights reserved.
+//
+
+#include "pipes.h"
+
+#define __WRITE_PIPE_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_internal_##SIZE(__global struct pipeimp* p, const STYPE* ptr) \
+{ \
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device); \
+    size_t ei = p->end_idx; \
+    size_t wi = reserve(&p->write_idx, ri+ei, 1); \
+    if (wi == ~(size_t)0) \
+        return -1; \
+ \
+    ((__global STYPE *)p->packets)[wi % ei] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_internal_user(__global struct pipeimp* p, const void* ptr, size_t size, size_t align)
+{
+    size_t ri = atomic_load_explicit(&p->read_idx, memory_order_relaxed, memory_scope_device);
+    size_t ei = p->end_idx;
+    size_t wi = reserve(&p->write_idx, ri+ei, 1);
+    if (wi == ~(size_t)0)
+        return -1;
+
+    __memcpy_internal_aligned(p->packets + (wi % ei)*size, ptr, size, align);
+
+    return 0;
+}
+
+#define __WRITE_PIPE_INDEXED_INTERNAL_SIZE(SIZE, STYPE) \
+__attribute__((weak, always_inline)) int \
+__write_pipe_reserved_internal_##SIZE(__global struct pipeimp* p, size_t rid, uint i, const STYPE* ptr)  \
+{ \
+    rid += i; \
+    ((__global STYPE *)p->packets)[rid % p->end_idx] = *ptr; \
+    return 0; \
+}
+
+DO_PIPE_INTERNAL_SIZE(__WRITE_PIPE_INDEXED_INTERNAL_SIZE)
+
+__attribute__((weak, always_inline)) int
+__write_pipe_reserved_internal_user(__global struct pipeimp* p, size_t rid, uint i, const void *ptr, size_t size, size_t align)
+{
+    rid += i;
+
+    __memcpy_internal_aligned(p->packets + (rid % p->end_idx)*size, ptr, size, align);
+
+    return 0;
+}
+
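
Writers reserve against read_idx + end_idx, so at most end_idx packets can be outstanding before write_pipe starts failing. Usage sketch (hypothetical kernel, not part of this commit), assuming write_pipe is lowered to __write_pipe_internal_<size>:

    __kernel void fill(__write_only pipe float p, __global const float *src)
    {
        float v = src[get_global_id(0)];
        if (write_pipe(p, &v) != 0) {
            // pipe full: reserve() returned ~(size_t)0 and the helper returned -1
        }
    }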

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subany.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,43 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_activelanecount_wavewidth_u32_b1(bool);
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_all(int predicate)
+{
+    return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) == __hsail_activelanecount_wavewidth_u32_b1(true);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) int
+sub_group_any(int predicate)
+{
+    return __hsail_activelanecount_wavewidth_u32_b1(predicate != 0) != 0;
+}
+
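
sub_group_all holds when the active-lane count for the predicate equals the count for true (every active lane passed); sub_group_any holds when that count is non-zero. Usage sketch (hypothetical kernel, not part of this commit):

    __kernel void flag_waves(__global const float *x, __global int *wave_ok)
    {
        int pred = x[get_global_id(0)] > 0.0f;
        if (sub_group_all(pred) && get_sub_group_local_id() == 0)
            wave_ok[get_group_id(0) * get_num_sub_groups() + get_sub_group_id()] = 1;
    }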

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subbar.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,40 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern void __hsail_wavebarrier(void);
+
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags)
+{
+    sub_group_barrier(flags, memory_scope_sub_group);
+}
+
+__attribute__((overloadable,weak,always_inline)) void
+sub_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+    // TODO: should CLK_IMAGE_MEM_FENCE be handled here as well?
+    atomic_work_item_fence(flags, memory_order_release, scope);
+    __hsail_wavebarrier();
+    atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
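
The barrier is a wavefront barrier bracketed by a release fence before and an acquire fence after, so memory operations covered by the requested flags and scope become visible across the sub-group. A minimal sketch (hypothetical kernel, not part of this commit; assumes a 1-D work-group) staging data through local memory within one sub-group:

    __kernel void read_last_lane(__global const int *in, __global int *out,
                                 __local int *tmp)
    {
        uint lid = (uint)get_local_id(0);
        tmp[lid] = in[get_global_id(0)];
        sub_group_barrier(CLK_LOCAL_MEM_FENCE);          // defined above
        uint last = lid - get_sub_group_local_id() + get_sub_group_size() - 1;
        out[get_global_id(0)] = tmp[last];               // every lane reads the wave's last element
    }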

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subbcast.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,52 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier();
+
+__attribute__((always_inline)) static uint
+bcast32(uint a, uint lid)
+{
+    a = __hsail_activelaneshuffle_wavewidth_b32(a, lid, 0U, false);
+    __hsail_wavebarrier();
+    return a;
+}
+
+extern __attribute__((overloadable, alias("bcast32"))) uint sub_group_broadcast(uint, uint);
+extern __attribute__((overloadable, alias("bcast32"))) int sub_group_broadcast(int, uint);
+extern __attribute__((overloadable, alias("bcast32"))) float sub_group_broadcast(float, uint);
+
+
+__attribute__((always_inline)) static ulong
+bcast64(ulong a, uint lid)
+{
+    a = __hsail_activelaneshuffle_wavewidth_b64(a, lid, 0UL, false);
+    __hsail_wavebarrier();
+    return a;
+}
+
+extern __attribute__((overloadable, alias("bcast64"))) ulong sub_group_broadcast(ulong, uint);
+extern __attribute__((overloadable, alias("bcast64"))) long sub_group_broadcast(long, uint);
+extern __attribute__((overloadable, alias("bcast64"))) double sub_group_broadcast(double, uint);
+
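
Broadcast is an active-lane shuffle from the requested lane followed by a wave barrier; the 32-bit helper is aliased for int/uint/float and the 64-bit helper for long/ulong/double. Usage sketch (hypothetical kernel, not part of this commit):

    __kernel void scale_by_first(__global float *x)
    {
        float v     = x[get_global_id(0)];
        float first = sub_group_broadcast(v, 0);   // routed to bcast32 above
        x[get_global_id(0)] = v / first;
    }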

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subget.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern __attribute__((pure)) uint __hsail_workitemid_flat(void);
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_size(void)
+{
+    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+    uint lid = (uint)get_local_linear_id();
+    return min(64U, wgs - (lid & ~63U));
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_max_sub_group_size(void)
+{
+    uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
+    return min(64U, wgs);
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_num_sub_groups(void)
+{
+    uint wgs = (uint)get_local_size(0) * (uint)get_local_size(1) * (uint)get_local_size(2);
+    return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_enqueued_num_sub_groups(void)
+{
+    uint wgs = (uint)get_enqueued_local_size(0) * get_enqueued_local_size(1) * get_enqueued_local_size(2);
+    return (wgs + 63U) >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_id(void)
+{
+    return __hsail_workitemid_flat() >> 6U;
+}
+
+#ifdef __clang__
+__attribute__((overloadable))
+#endif
+__attribute__((always_inline)) uint
+get_sub_group_local_id(void)
+{
+    return __hsail_workitemid_flat() & 0x3fU;
+}
+
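
These queries hard-code the HSAIL wavefront width of 64: the sub-group id is the flat work-item id divided by 64, the lane id is its low six bits, and the last sub-group of a work-group may be partial. A worked example of that mapping (illustrative, not part of this commit) for a 1-D work-group of 200 work-items:

    // flat id   0..63   -> sub-group 0, lanes 0..63, get_sub_group_size() == 64
    // flat id  64..127  -> sub-group 1, lanes 0..63, size 64
    // flat id 128..191  -> sub-group 2, lanes 0..63, size 64
    // flat id 192..199  -> sub-group 3, lanes 0..7,  size min(64, 200 - 192) == 8
    // get_num_sub_groups() == (200 + 63) >> 6 == 4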

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subreduce.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier();
+
+#define GENA(TY,SZ,AO,AI,Z) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_add(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(Z), false)); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+GENA(int,32,as_int,as_uint,0)
+GENA(uint,32,,,0U)
+GENA(long,64,as_long,as_ulong,0L)
+GENA(ulong,64,,,0UL)
+GENA(float,32,as_float,as_uint,0.0f)
+GENA(double,64,as_double,as_ulong,0.0)
+
+#define GENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_reduce_##OP(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^1, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^2, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^4, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^8, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^16, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), lid^32, AI(ID), false))); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+GENO(int,32,min,as_int,as_uint,INT_MAX)
+GENO(uint,32,min,,,UINT_MAX)
+GENO(long,64,min,as_long,as_ulong,LONG_MAX)
+GENO(ulong,64,min,,,ULONG_MAX)
+GENO(float,32,min,as_float,as_uint,INFINITY)
+GENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+
+GENO(int,32,max,as_int,as_uint,INT_MIN)
+GENO(uint,32,max,,,0U)
+GENO(long,64,max,as_long,as_ulong,LONG_MIN)
+GENO(ulong,64,max,,,0UL)
+GENO(float,32,max,as_float,as_uint,-INFINITY)
+GENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
+#endif
+
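
The reductions are six-step XOR butterflies: each step combines a lane with its partner at distance 1, 2, 4, 8, 16, and 32, so after the final step every lane of the wavefront holds the full result. Usage sketch (hypothetical kernel, not part of this commit):

    __kernel void wave_sums(__global const float *in, __global float *out)
    {
        float sum = sub_group_reduce_add(in[get_global_id(0)]);  // same value in every lane
        if (get_sub_group_local_id() == 0)
            out[get_group_id(0) * get_num_sub_groups() + get_sub_group_id()] = sum;
    }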

Added: libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/subgroup/subscan.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,130 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#pragma OPENCL EXTENSION cl_khr_subgroups : enable
+
+extern uint __hsail_get_lane_id(void);
+extern uint __hsail_activelaneshuffle_wavewidth_b32(uint src, uint lid, uint ival, bool useival);
+extern ulong __hsail_activelaneshuffle_wavewidth_b64(ulong src, uint lid, ulong ival, bool useival);
+extern void __hsail_wavebarrier();
+
+// Define exclusive in terms of inclusive
+
+#define EGEN(TY,OP,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_exclusive_##OP(TY a) \
+{ \
+    a = sub_group_scan_inclusive_##OP(a); \
+    uint lid = __hsail_get_lane_id(); \
+    a = AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+    return a; \
+}
+
+EGEN(int,add,32,as_int,as_uint,0)
+EGEN(int,min,32,as_int,as_uint,INT_MAX)
+EGEN(int,max,32,as_int,as_uint,INT_MIN)
+
+EGEN(uint,add,32,,,0)
+EGEN(uint,min,32,,,UINT_MAX)
+EGEN(uint,max,32,,,0U)
+
+EGEN(long,add,64,as_long,as_ulong,0L)
+EGEN(long,min,64,as_long,as_ulong,LONG_MAX)
+EGEN(long,max,64,as_long,as_ulong,LONG_MIN)
+
+EGEN(ulong,add,64,,,0UL)
+EGEN(ulong,min,64,,,ULONG_MAX)
+EGEN(ulong,max,64,,,0UL)
+
+EGEN(float,add,32,as_float,as_uint,0.0f)
+EGEN(float,min,32,as_float,as_uint,INFINITY)
+EGEN(float,max,32,as_float,as_uint,-INFINITY)
+
+EGEN(double,add,64,as_double,as_ulong,0.0)
+EGEN(double,min,64,as_double,as_ulong,(double)INFINITY)
+EGEN(double,max,64,as_double,as_ulong,-(double)INFINITY)
+
+// Now inclusive scan
+
+#define IGENA(TY,SZ,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_add(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16)); \
+    __hsail_wavebarrier(); \
+    a += AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32)); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+#define IGENO(TY,SZ,OP,AO,AI,ID) \
+__attribute__((overloadable, always_inline)) TY \
+sub_group_scan_inclusive_##OP(TY a) \
+{ \
+    uint lid = __hsail_get_lane_id(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-1)&0x3f, AI(ID), lid < 1))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-2)&0x3f, AI(ID), lid < 2))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-4)&0x3f, AI(ID), lid < 4))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-8)&0x3f, AI(ID), lid < 8))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-16)&0x3f, AI(ID), lid < 16))); \
+    __hsail_wavebarrier(); \
+    a = OP(a, AO(__hsail_activelaneshuffle_wavewidth_b##SZ(AI(a), (lid-32)&0x3f, AI(ID), lid < 32))); \
+    __hsail_wavebarrier(); \
+    return a; \
+}
+
+IGENA(int,32,as_int,as_uint,0)
+IGENO(int,32,min,as_int,as_uint,INT_MAX)
+IGENO(int,32,max,as_int,as_uint,INT_MIN)
+
+IGENA(uint,32,,,0U)
+IGENO(uint,32,min,,,UINT_MAX)
+IGENO(uint,32,max,,,0U)
+
+IGENA(long,64,as_long,as_ulong,0L)
+IGENO(long,64,min,as_long,as_ulong,LONG_MAX)
+IGENO(long,64,max,as_long,as_ulong,LONG_MIN)
+
+IGENA(ulong,64,,,0UL)
+IGENO(ulong,64,min,,,ULONG_MAX)
+IGENO(ulong,64,max,,,0UL)
+
+IGENA(float,32,as_float,as_uint,0.0f)
+IGENO(float,32,min,as_float,as_uint,INFINITY)
+IGENO(float,32,max,as_float,as_uint,-INFINITY)
+
+IGENA(double,64,as_double,as_ulong,0.0)
+IGENO(double,64,min,as_double,as_ulong,(double)INFINITY)
+IGENO(double,64,max,as_double,as_ulong,-(double)INFINITY)
+
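
The inclusive scans follow a Hillis-Steele pattern: shuffle from lanes 1, 2, 4, ..., 32 positions back, substituting the identity whenever the source lane would fall before lane 0. The exclusive scans are derived by running the inclusive scan and shifting the result down one lane, with lane 0 receiving the identity. Usage sketch (hypothetical kernel, not part of this commit):

    __kernel void wave_prefix(__global const uint *in,
                              __global uint *incl, __global uint *excl)
    {
        uint v = in[get_global_id(0)];
        incl[get_global_id(0)] = sub_group_scan_inclusive_add(v);  // includes own element
        excl[get_global_id(0)] = sub_group_scan_exclusive_add(v);  // preceding lanes only
    }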

Added: libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/f16_f32.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,330 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_half_to_float_f32(uint op1);
+
+extern float __amdil_float_to_half_f32(float op1);
+extern float __amdil_float_to_half_near_f32(float op1);
+extern float __amdil_float_to_half_neg_inf_f32(float op1);
+extern float __amdil_float_to_half_plus_inf_f32(float op1);
+
+// half -> float
+__attribute__((always_inline)) float
+__cvt_f16_to_f32(ushort a)
+{
+    return __amdil_half_to_float_f32((uint)a);
+}
+
+__attribute__((always_inline)) float2
+__cvt_2f16_to_2f32(ushort2 ush)
+{
+    float2 ret;
+    ret.s0 = __cvt_f16_to_f32(ush.s0);
+    ret.s1 = __cvt_f16_to_f32(ush.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) float3
+__cvt_3f16_to_3f32(ushort3 ush)
+{
+    float3 ret;
+    ret.lo = __cvt_2f16_to_2f32(ush.lo);
+    ret.s2 = __cvt_f16_to_f32(ush.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) float4
+__cvt_4f16_to_4f32(ushort4 ush)
+{
+    float4 ret;
+    ret.lo = __cvt_2f16_to_2f32(ush.lo);
+    ret.hi = __cvt_2f16_to_2f32(ush.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) float8
+__cvt_8f16_to_8f32(ushort8 ush)
+{
+    float8 ret;
+    ret.lo = __cvt_4f16_to_4f32(ush.lo);
+    ret.hi = __cvt_4f16_to_4f32(ush.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) float16
+__cvt_16f16_to_16f32(ushort16 ush)
+{
+    float16 ret;
+    ret.lo = __cvt_8f16_to_8f32(ush.lo);
+    ret.hi = __cvt_8f16_to_8f32(ush.hi);
+    return ret;
+}
+
+// float -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_rte(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_near_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rte(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rte(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rte(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rte(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rte(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rte(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rte(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rte(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rte(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rte(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rte(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rte(f.hi);
+    return ret;
+}
+
+// float -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f32_to_f16_cur(float f)
+{
+    return __cvt_f32_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_cur(float2 f)
+{
+    return __cvt_2f32_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_cur(float3 f)
+{
+    return __cvt_3f32_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_cur(float4 f)
+{
+    return __cvt_4f32_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_cur(float8 f)
+{
+    return __cvt_8f32_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_cur(float16 f)
+{
+    return __cvt_16f32_to_16f16_rte(f);
+}
+
+//float -> half rtp
+
+ushort
+__cvt_f32_to_f16_rtp(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_plus_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtp(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtp(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtp(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtp(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtp(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtp(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtp(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtp(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtp(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtp(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtp(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtp(f.hi);
+    return ret;
+}
+
+// float -> half rtn
+
+ushort
+__cvt_f32_to_f16_rtn(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_neg_inf_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtn(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtn(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtn(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtn(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtn(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtn(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtn(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtn(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtn(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtn(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtn(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtn(f.hi);
+    return ret;
+}
+
+// float -> half rtz
+
+ushort
+__cvt_f32_to_f16_rtz(float a)
+{
+    return (ushort)as_uint(__amdil_float_to_half_f32(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f32_to_2f16_rtz(float2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f32_to_f16_rtz(f.s0);
+    ret.s1 = __cvt_f32_to_f16_rtz(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f32_to_3f16_rtz(float3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+    ret.s2 = __cvt_f32_to_f16_rtz(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f32_to_4f16_rtz(float4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f32_to_2f16_rtz(f.lo);
+    ret.hi = __cvt_2f32_to_2f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f32_to_8f16_rtz(float8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f32_to_4f16_rtz(f.lo);
+    ret.hi = __cvt_4f32_to_4f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f32_to_16f16_rtz(float16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f32_to_8f16_rtz(f.lo);
+    ret.hi = __cvt_8f32_to_8f16_rtz(f.hi);
+    return ret;
+}
+
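
These are the scalar and vector half/float conversion helpers underneath the vload_half/vstore_half families (the generated wrappers live in vldst_gen.cl below); the _cur variants assume the current rounding mode is round-to-nearest-even, as the XXX comment notes. Usage sketch (hypothetical kernel, not part of this commit), assuming the built-ins are lowered to these helpers:

    __kernel void halve_halfs(__global const half *in, __global half *out)
    {
        size_t i = get_global_id(0);
        float4 v = vload_half4(i, in);             // __cvt_4f16_to_4f32 path
        vstore_half4_rtz(v * 0.5f, i, out);        // __cvt_4f32_to_4f16_rtz path
    }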

Added: libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/f64_f16.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,276 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+extern float __amdil_double_to_half_f64(double op1);
+extern float __amdil_double_to_half_near_f64(double op1);
+extern float __amdil_double_to_half_neg_inf_f64(double op1);
+extern float __amdil_double_to_half_plus_inf_f64(double op1);
+
+// double -> half rte
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_rte(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_near_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rte(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rte(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rte(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rte(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rte(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rte(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rte(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rte(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rte(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rte(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rte(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rte(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rte(f.hi);
+    return ret;
+}
+
+// double -> half cur
+// XXX assumes RTE
+__attribute__((always_inline)) ushort
+__cvt_f64_to_f16_cur(double f)
+{
+    return __cvt_f64_to_f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_cur(double2 f)
+{
+    return __cvt_2f64_to_2f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_cur(double3 f)
+{
+    return __cvt_3f64_to_3f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_cur(double4 f)
+{
+    return __cvt_4f64_to_4f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_cur(double8 f)
+{
+    return __cvt_8f64_to_8f16_rte(f);
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_cur(double16 f)
+{
+    return __cvt_16f64_to_16f16_rte(f);
+}
+
+//double -> half rtp
+
+ushort
+__cvt_f64_to_f16_rtp(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_plus_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtp(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtp(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtp(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtp(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtp(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtp(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtp(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtp(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtp(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtp(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtp(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtp(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtp(f.hi);
+    return ret;
+}
+
+// double -> half, round toward negative infinity (rtn)
+
+ushort
+__cvt_f64_to_f16_rtn(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_neg_inf_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtn(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtn(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtn(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtn(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtn(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtn(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtn(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtn(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtn(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtn(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtn(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtn(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtn(f.hi);
+    return ret;
+}
+
+// double -> half, round toward zero (rtz)
+
+ushort
+__cvt_f64_to_f16_rtz(double a)
+{
+    return (ushort)as_uint(__amdil_double_to_half_f64(a));
+}
+
+__attribute__((always_inline)) ushort2
+__cvt_2f64_to_2f16_rtz(double2 f)
+{
+    ushort2 ret;
+    ret.s0 = __cvt_f64_to_f16_rtz(f.s0);
+    ret.s1 = __cvt_f64_to_f16_rtz(f.s1);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort3
+__cvt_3f64_to_3f16_rtz(double3 f)
+{
+    ushort3 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+    ret.s2 = __cvt_f64_to_f16_rtz(f.s2);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort4
+__cvt_4f64_to_4f16_rtz(double4 f)
+{
+    ushort4 ret;
+    ret.lo = __cvt_2f64_to_2f16_rtz(f.lo);
+    ret.hi = __cvt_2f64_to_2f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort8
+__cvt_8f64_to_8f16_rtz(double8 f)
+{
+    ushort8 ret;
+    ret.lo = __cvt_4f64_to_4f16_rtz(f.lo);
+    ret.hi = __cvt_4f64_to_4f16_rtz(f.hi);
+    return ret;
+}
+
+__attribute__((always_inline)) ushort16
+__cvt_16f64_to_16f16_rtz(double16 f)
+{
+    ushort16 ret;
+    ret.lo = __cvt_8f64_to_8f16_rtz(f.lo);
+    ret.hi = __cvt_8f64_to_8f16_rtz(f.hi);
+    return ret;
+}
+
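The obvious consumers of these helpers are the half storage built-ins for double sources, whose call sites live elsewhere in the library rather than in this file. A sketch of how one such wrapper could use them (the function name is hypothetical; assumes the device supports double):

    void example_vstore_half_rtp(double x, size_t i, __global half *p)
    {
        // half is a storage-only type here, so write the raw 16-bit pattern
        ((__global ushort *)p)[i] = __cvt_f64_to_f16_rtp(x);
    }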

Added: libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/vldst_gen.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,3206 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const float *p)
+{
+    return as_float2(vload2(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __constant float *p)
+{
+    return as_float2(vload2(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __global float *p)
+{
+    return as_float2(vload2(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float2
+vload2(size_t i, const __local float *p)
+{
+    return as_float2(vload2(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const double *p)
+{
+    return as_double2(vload2(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __constant double *p)
+{
+    return as_double2(vload2(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __global double *p)
+{
+    return as_double2(vload2(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double2
+vload2(size_t i, const __local double *p)
+{
+    return as_double2(vload2(i, (const __local long *)p));
+}
+#endif
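The floating-point vload overloads above never load floats or doubles directly: they forward to the int/long overloads of the same width and reinterpret the result with as_floatN/as_doubleN, which preserves the bytes exactly, so only the integer element loaders later in this file contain real load code. The same bit-pattern round trip in scalar form (a sketch, not from the commit):

    float example_bitcast_load(const __global float *p)
    {
        int bits = *(const __global int *)p;  // load the 32-bit pattern as an int
        return as_float(bits);                // reinterpret; no value conversion
    }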
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const float *p)
+{
+    return as_float3(vload3(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __constant float *p)
+{
+    return as_float3(vload3(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __global float *p)
+{
+    return as_float3(vload3(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float3
+vload3(size_t i, const __local float *p)
+{
+    return as_float3(vload3(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const double *p)
+{
+    return as_double3(vload3(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __constant double *p)
+{
+    return as_double3(vload3(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __global double *p)
+{
+    return as_double3(vload3(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double3
+vload3(size_t i, const __local double *p)
+{
+    return as_double3(vload3(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const float *p)
+{
+    return as_float4(vload4(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __constant float *p)
+{
+    return as_float4(vload4(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __global float *p)
+{
+    return as_float4(vload4(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float4
+vload4(size_t i, const __local float *p)
+{
+    return as_float4(vload4(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const double *p)
+{
+    return as_double4(vload4(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __constant double *p)
+{
+    return as_double4(vload4(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __global double *p)
+{
+    return as_double4(vload4(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double4
+vload4(size_t i, const __local double *p)
+{
+    return as_double4(vload4(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const float *p)
+{
+    return as_float8(vload8(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __constant float *p)
+{
+    return as_float8(vload8(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __global float *p)
+{
+    return as_float8(vload8(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float8
+vload8(size_t i, const __local float *p)
+{
+    return as_float8(vload8(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const double *p)
+{
+    return as_double8(vload8(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __constant double *p)
+{
+    return as_double8(vload8(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __global double *p)
+{
+    return as_double8(vload8(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double8
+vload8(size_t i, const __local double *p)
+{
+    return as_double8(vload8(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const float *p)
+{
+    return as_float16(vload16(i, (const int *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __constant float *p)
+{
+    return as_float16(vload16(i, (const __constant int *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __global float *p)
+{
+    return as_float16(vload16(i, (const __global int *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) float16
+vload16(size_t i, const __local float *p)
+{
+    return as_float16(vload16(i, (const __local int *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const double *p)
+{
+    return as_double16(vload16(i, (const long *)p));
+}
+
+
+
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __constant double *p)
+{
+    return as_double16(vload16(i, (const __constant long *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __global double *p)
+{
+    return as_double16(vload16(i, (const __global long *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) double16
+vload16(size_t i, const __local double *p)
+{
+    return as_double16(vload16(i, (const __local long *)p));
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, float *p)
+{
+    vstore2(as_int2(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __global float *p)
+{
+    vstore2(as_int2(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(float2 v, size_t i, __local float *p)
+{
+    vstore2(as_int2(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, double *p)
+{
+    vstore2(as_long2(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __global double *p)
+{
+    vstore2(as_long2(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore2(double2 v, size_t i, __local double *p)
+{
+    vstore2(as_long2(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, float *p)
+{
+    vstore3(as_int3(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __global float *p)
+{
+    vstore3(as_int3(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(float3 v, size_t i, __local float *p)
+{
+    vstore3(as_int3(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, double *p)
+{
+    vstore3(as_long3(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __global double *p)
+{
+    vstore3(as_long3(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore3(double3 v, size_t i, __local double *p)
+{
+    vstore3(as_long3(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, float *p)
+{
+    vstore4(as_int4(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __global float *p)
+{
+    vstore4(as_int4(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(float4 v, size_t i, __local float *p)
+{
+    vstore4(as_int4(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, double *p)
+{
+    vstore4(as_long4(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __global double *p)
+{
+    vstore4(as_long4(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore4(double4 v, size_t i, __local double *p)
+{
+    vstore4(as_long4(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, float *p)
+{
+    vstore8(as_int8(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __global float *p)
+{
+    vstore8(as_int8(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(float8 v, size_t i, __local float *p)
+{
+    vstore8(as_int8(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, double *p)
+{
+    vstore8(as_long8(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __global double *p)
+{
+    vstore8(as_long8(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore8(double8 v, size_t i, __local double *p)
+{
+    vstore8(as_long8(v), i, (__local long *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, float *p)
+{
+    vstore16(as_int16(v), i, (int *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __global float *p)
+{
+    vstore16(as_int16(v), i, (__global int *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(float16 v, size_t i, __local float *p)
+{
+    vstore16(as_int16(v), i, (__local int *)p);
+}
+#endif
+
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, double *p)
+{
+    vstore16(as_long16(v), i, (long *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __global double *p)
+{
+    vstore16(as_long16(v), i, (__global long *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((overloadable, always_inline, weak)) void
+vstore16(double16 v, size_t i, __local double *p)
+{
+    vstore16(as_long16(v), i, (__local long *)p);
+}
+#endif
+
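The floating-point vstore overloads mirror the loads: they reinterpret the vector as intN/longN with as_intN/as_longN and forward to the integer overloads, so the byte pattern written to memory is unchanged. In the sketch below (not from the commit) the two calls produce identical memory contents:

    void example_equivalent_stores(float2 v, size_t i, __global float *p)
    {
        vstore2(v, i, p);                            // public overload above
        vstore2(as_int2(v), i, (__global int *)p);   // what it forwards to
    }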
+
+__attribute__((always_inline)) static char2
+vldp12(size_t i, const char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp12")))  char2 vload2(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp12"))) uchar2 vload2(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char2
+vldc12(size_t i, const __constant char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc12")))  char2 vload2(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc12"))) uchar2 vload2(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldg12(size_t i, const __global char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg12")))  char2 vload2(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg12"))) uchar2 vload2(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char2
+vldl12(size_t i, const __local char *p)
+{
+    char2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl12")))  char2 vload2(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl12"))) uchar2 vload2(size_t, const __local uchar *);
+#endif
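The char2 loaders above establish the pattern used for the rest of the file: each address space gets one static, always_inline element-wise loader (vldp12, vldc12, vldg12, vldl12, and so on), and the public signed and unsigned vload overloads are declared as weak aliases to that single definition, since the load is a plain byte copy either way. The __global and __local variants are guarded by __OPENCL_C_VERSION__ < 200, presumably because the unqualified (generic address space) overload already covers those pointers in OpenCL 2.0. The idiom, with hypothetical names (a sketch, not part of the commit):

    __attribute__((always_inline)) static char2
    example_ld2(size_t i, const __global char *p)
    {
        char2 r;
        p += i * 2;      // element index i selects the i-th pair
        r.s0 = p[0];
        r.s1 = p[1];
        return r;
    }
    // both the signed and the unsigned overload resolve to the same symbol
    extern __attribute__((overloadable, weak, alias("example_ld2")))  char2 example_vload2(size_t, const __global  char *);
    extern __attribute__((overloadable, weak, alias("example_ld2"))) uchar2 example_vload2(size_t, const __global uchar *);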
+
+
+__attribute__((always_inline)) static short2
+vldp22(size_t i, const short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp22")))  short2 vload2(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp22"))) ushort2 vload2(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short2
+vldc22(size_t i, const __constant short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc22")))  short2 vload2(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc22"))) ushort2 vload2(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldg22(size_t i, const __global short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg22")))  short2 vload2(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg22"))) ushort2 vload2(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short2
+vldl22(size_t i, const __local short *p)
+{
+    short2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl22")))  short2 vload2(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl22"))) ushort2 vload2(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int2
+vldp42(size_t i, const int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp42")))  int2 vload2(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp42"))) uint2 vload2(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int2
+vldc42(size_t i, const __constant int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc42")))  int2 vload2(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc42"))) uint2 vload2(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldg42(size_t i, const __global int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg42")))  int2 vload2(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg42"))) uint2 vload2(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int2
+vldl42(size_t i, const __local int *p)
+{
+    int2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl42")))  int2 vload2(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl42"))) uint2 vload2(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long2
+vldp82(size_t i, const long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp82")))  long2 vload2(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp82"))) ulong2 vload2(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long2
+vldc82(size_t i, const __constant long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc82")))  long2 vload2(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc82"))) ulong2 vload2(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldg82(size_t i, const __global long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg82")))  long2 vload2(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg82"))) ulong2 vload2(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long2
+vldl82(size_t i, const __local long *p)
+{
+    long2 ret;
+    p += i * 2;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl82")))  long2 vload2(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl82"))) ulong2 vload2(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char3
+vldp13(size_t i, const char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp13")))  char3 vload3(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp13"))) uchar3 vload3(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char3
+vldc13(size_t i, const __constant char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc13")))  char3 vload3(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc13"))) uchar3 vload3(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldg13(size_t i, const __global char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg13")))  char3 vload3(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg13"))) uchar3 vload3(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char3
+vldl13(size_t i, const __local char *p)
+{
+    char3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl13")))  char3 vload3(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl13"))) uchar3 vload3(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short3
+vldp23(size_t i, const short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp23")))  short3 vload3(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp23"))) ushort3 vload3(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short3
+vldc23(size_t i, const __constant short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc23")))  short3 vload3(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc23"))) ushort3 vload3(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldg23(size_t i, const __global short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg23")))  short3 vload3(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg23"))) ushort3 vload3(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short3
+vldl23(size_t i, const __local short *p)
+{
+    short3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl23")))  short3 vload3(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl23"))) ushort3 vload3(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int3
+vldp43(size_t i, const int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp43")))  int3 vload3(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp43"))) uint3 vload3(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int3
+vldc43(size_t i, const __constant int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc43")))  int3 vload3(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc43"))) uint3 vload3(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldg43(size_t i, const __global int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg43")))  int3 vload3(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg43"))) uint3 vload3(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int3
+vldl43(size_t i, const __local int *p)
+{
+    int3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl43")))  int3 vload3(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl43"))) uint3 vload3(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long3
+vldp83(size_t i, const long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp83")))  long3 vload3(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp83"))) ulong3 vload3(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long3
+vldc83(size_t i, const __constant long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc83")))  long3 vload3(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc83"))) ulong3 vload3(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldg83(size_t i, const __global long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg83")))  long3 vload3(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg83"))) ulong3 vload3(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long3
+vldl83(size_t i, const __local long *p)
+{
+    long3 ret;
+    p += i * 3;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl83")))  long3 vload3(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl83"))) ulong3 vload3(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char4
+vldp14(size_t i, const char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp14")))  char4 vload4(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp14"))) uchar4 vload4(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char4
+vldc14(size_t i, const __constant char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc14")))  char4 vload4(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc14"))) uchar4 vload4(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldg14(size_t i, const __global char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg14")))  char4 vload4(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg14"))) uchar4 vload4(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char4
+vldl14(size_t i, const __local char *p)
+{
+    char4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl14")))  char4 vload4(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl14"))) uchar4 vload4(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short4
+vldp24(size_t i, const short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp24")))  short4 vload4(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp24"))) ushort4 vload4(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short4
+vldc24(size_t i, const __constant short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc24")))  short4 vload4(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc24"))) ushort4 vload4(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldg24(size_t i, const __global short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg24")))  short4 vload4(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg24"))) ushort4 vload4(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short4
+vldl24(size_t i, const __local short *p)
+{
+    short4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl24")))  short4 vload4(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl24"))) ushort4 vload4(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int4
+vldp44(size_t i, const int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp44")))  int4 vload4(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp44"))) uint4 vload4(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int4
+vldc44(size_t i, const __constant int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc44")))  int4 vload4(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc44"))) uint4 vload4(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldg44(size_t i, const __global int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg44")))  int4 vload4(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg44"))) uint4 vload4(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int4
+vldl44(size_t i, const __local int *p)
+{
+    int4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl44")))  int4 vload4(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl44"))) uint4 vload4(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long4
+vldp84(size_t i, const long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp84")))  long4 vload4(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp84"))) ulong4 vload4(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long4
+vldc84(size_t i, const __constant long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc84")))  long4 vload4(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc84"))) ulong4 vload4(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldg84(size_t i, const __global long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg84")))  long4 vload4(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg84"))) ulong4 vload4(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long4
+vldl84(size_t i, const __local long *p)
+{
+    long4 ret;
+    p += i * 4;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl84")))  long4 vload4(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl84"))) ulong4 vload4(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char8
+vldp18(size_t i, const char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp18")))  char8 vload8(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp18"))) uchar8 vload8(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char8
+vldc18(size_t i, const __constant char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc18")))  char8 vload8(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc18"))) uchar8 vload8(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldg18(size_t i, const __global char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg18")))  char8 vload8(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg18"))) uchar8 vload8(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char8
+vldl18(size_t i, const __local char *p)
+{
+    char8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl18")))  char8 vload8(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl18"))) uchar8 vload8(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short8
+vldp28(size_t i, const short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp28")))  short8 vload8(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp28"))) ushort8 vload8(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short8
+vldc28(size_t i, const __constant short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc28")))  short8 vload8(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc28"))) ushort8 vload8(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldg28(size_t i, const __global short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg28")))  short8 vload8(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg28"))) ushort8 vload8(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short8
+vldl28(size_t i, const __local short *p)
+{
+    short8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl28")))  short8 vload8(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl28"))) ushort8 vload8(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int8
+vldp48(size_t i, const int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp48")))  int8 vload8(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp48"))) uint8 vload8(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int8
+vldc48(size_t i, const __constant int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc48")))  int8 vload8(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc48"))) uint8 vload8(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldg48(size_t i, const __global int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg48")))  int8 vload8(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg48"))) uint8 vload8(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int8
+vldl48(size_t i, const __local int *p)
+{
+    int8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl48")))  int8 vload8(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl48"))) uint8 vload8(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long8
+vldp88(size_t i, const long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp88")))  long8 vload8(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp88"))) ulong8 vload8(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long8
+vldc88(size_t i, const __constant long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc88")))  long8 vload8(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc88"))) ulong8 vload8(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldg88(size_t i, const __global long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg88")))  long8 vload8(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg88"))) ulong8 vload8(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long8
+vldl88(size_t i, const __local long *p)
+{
+    long8 ret;
+    p += i * 8;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl88")))  long8 vload8(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl88"))) ulong8 vload8(size_t, const __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static char16
+vldp116(size_t i, const char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp116")))  char16 vload16(size_t, const  char *);
+extern __attribute__((overloadable, weak, alias("vldp116"))) uchar16 vload16(size_t, const uchar *);
+
+
+
+__attribute__((always_inline)) static char16
+vldc116(size_t i, const __constant char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc116")))  char16 vload16(size_t, const __constant  char *);
+extern __attribute__((overloadable, weak, alias("vldc116"))) uchar16 vload16(size_t, const __constant uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldg116(size_t i, const __global char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg116")))  char16 vload16(size_t, const __global  char *);
+extern __attribute__((overloadable, weak, alias("vldg116"))) uchar16 vload16(size_t, const __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static char16
+vldl116(size_t i, const __local char *p)
+{
+    char16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl116")))  char16 vload16(size_t, const __local  char *);
+extern __attribute__((overloadable, weak, alias("vldl116"))) uchar16 vload16(size_t, const __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static short16
+vldp216(size_t i, const short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp216")))  short16 vload16(size_t, const  short *);
+extern __attribute__((overloadable, weak, alias("vldp216"))) ushort16 vload16(size_t, const ushort *);
+
+
+
+__attribute__((always_inline)) static short16
+vldc216(size_t i, const __constant short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc216")))  short16 vload16(size_t, const __constant  short *);
+extern __attribute__((overloadable, weak, alias("vldc216"))) ushort16 vload16(size_t, const __constant ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldg216(size_t i, const __global short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg216")))  short16 vload16(size_t, const __global  short *);
+extern __attribute__((overloadable, weak, alias("vldg216"))) ushort16 vload16(size_t, const __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static short16
+vldl216(size_t i, const __local short *p)
+{
+    short16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl216")))  short16 vload16(size_t, const __local  short *);
+extern __attribute__((overloadable, weak, alias("vldl216"))) ushort16 vload16(size_t, const __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static int16
+vldp416(size_t i, const int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp416")))  int16 vload16(size_t, const  int *);
+extern __attribute__((overloadable, weak, alias("vldp416"))) uint16 vload16(size_t, const uint *);
+
+
+
+__attribute__((always_inline)) static int16
+vldc416(size_t i, const __constant int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc416")))  int16 vload16(size_t, const __constant  int *);
+extern __attribute__((overloadable, weak, alias("vldc416"))) uint16 vload16(size_t, const __constant uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldg416(size_t i, const __global int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg416")))  int16 vload16(size_t, const __global  int *);
+extern __attribute__((overloadable, weak, alias("vldg416"))) uint16 vload16(size_t, const __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static int16
+vldl416(size_t i, const __local int *p)
+{
+    int16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl416")))  int16 vload16(size_t, const __local  int *);
+extern __attribute__((overloadable, weak, alias("vldl416"))) uint16 vload16(size_t, const __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static long16
+vldp816(size_t i, const long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldp816")))  long16 vload16(size_t, const  long *);
+extern __attribute__((overloadable, weak, alias("vldp816"))) ulong16 vload16(size_t, const ulong *);
+
+
+
+__attribute__((always_inline)) static long16
+vldc816(size_t i, const __constant long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldc816")))  long16 vload16(size_t, const __constant  long *);
+extern __attribute__((overloadable, weak, alias("vldc816"))) ulong16 vload16(size_t, const __constant ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldg816(size_t i, const __global long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldg816")))  long16 vload16(size_t, const __global  long *);
+extern __attribute__((overloadable, weak, alias("vldg816"))) ulong16 vload16(size_t, const __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static long16
+vldl816(size_t i, const __local long *p)
+{
+    long16 ret;
+    p += i * 16;
+    ret.s0 = p[0];
+    ret.s1 = p[1];
+    ret.s2 = p[2];
+    ret.s3 = p[3];
+    ret.s4 = p[4];
+    ret.s5 = p[5];
+    ret.s6 = p[6];
+    ret.s7 = p[7];
+    ret.s8 = p[8];
+    ret.s9 = p[9];
+    ret.sa = p[10];
+    ret.sb = p[11];
+    ret.sc = p[12];
+    ret.sd = p[13];
+    ret.se = p[14];
+    ret.sf = p[15];
+
+    return ret;
+}
+extern __attribute__((overloadable, weak, alias("vldl816")))  long16 vload16(size_t, const __local  long *);
+extern __attribute__((overloadable, weak, alias("vldl816"))) ulong16 vload16(size_t, const __local ulong *);
+#endif
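
The vloadN overloads above (and the vstoreN overloads that follow) touch each
element with a scalar access instead of a single wide load or store: the OpenCL
spec only requires the pointer passed to vloadN/vstoreN to be aligned to the
element type, not to the vector type, so a widened access could fault on
targets that demand natural vector alignment. As a minimal usage sketch (the
kernel and buffer names are hypothetical, not part of this patch):

    __kernel void copy16(__global const int *src, __global int *dst)
    {
        size_t gid = get_global_id(0);
        int16 v = vload16(gid, src);   /* binds to the __global overload (vldg416) */
        vstore16(v, gid, dst);         /* binds to the __global overload (vstg416) */
    }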
+
+
+__attribute__((always_inline)) static void
+vstp12(char2 v, size_t i, char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2( char2, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp12"))) void vstore2(uchar2, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg12(char2 v, size_t i, __global char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2( char2, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg12"))) void vstore2(uchar2, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl12(char2 v, size_t i, __local char *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2( char2, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl12"))) void vstore2(uchar2, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp22(short2 v, size_t i, short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2( short2, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp22"))) void vstore2(ushort2, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg22(short2 v, size_t i, __global short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2( short2, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg22"))) void vstore2(ushort2, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl22(short2 v, size_t i, __local short *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2( short2, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl22"))) void vstore2(ushort2, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp42(int2 v, size_t i, int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2( int2, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp42"))) void vstore2(uint2, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg42(int2 v, size_t i, __global int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2( int2, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg42"))) void vstore2(uint2, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl42(int2 v, size_t i, __local int *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2( int2, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl42"))) void vstore2(uint2, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp82(long2 v, size_t i, long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2( long2, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp82"))) void vstore2(ulong2, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg82(long2 v, size_t i, __global long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2( long2, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg82"))) void vstore2(ulong2, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl82(long2 v, size_t i, __local long *p)
+{
+    p += i * 2;
+    p[0] = v.s0;
+    p[1] = v.s1;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2( long2, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl82"))) void vstore2(ulong2, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp13(char3 v, size_t i, char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3( char3, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp13"))) void vstore3(uchar3, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg13(char3 v, size_t i, __global char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3( char3, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg13"))) void vstore3(uchar3, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl13(char3 v, size_t i, __local char *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3( char3, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl13"))) void vstore3(uchar3, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp23(short3 v, size_t i, short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3( short3, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp23"))) void vstore3(ushort3, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg23(short3 v, size_t i, __global short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3( short3, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg23"))) void vstore3(ushort3, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl23(short3 v, size_t i, __local short *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3( short3, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl23"))) void vstore3(ushort3, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp43(int3 v, size_t i, int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3( int3, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp43"))) void vstore3(uint3, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg43(int3 v, size_t i, __global int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3( int3, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg43"))) void vstore3(uint3, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl43(int3 v, size_t i, __local int *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3( int3, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl43"))) void vstore3(uint3, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp83(long3 v, size_t i, long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3( long3, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp83"))) void vstore3(ulong3, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg83(long3 v, size_t i, __global long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3( long3, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg83"))) void vstore3(ulong3, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl83(long3 v, size_t i, __local long *p)
+{
+    p += i * 3;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3( long3, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl83"))) void vstore3(ulong3, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp14(char4 v, size_t i, char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4( char4, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp14"))) void vstore4(uchar4, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg14(char4 v, size_t i, __global char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4( char4, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg14"))) void vstore4(uchar4, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl14(char4 v, size_t i, __local char *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4( char4, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl14"))) void vstore4(uchar4, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp24(short4 v, size_t i, short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4( short4, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp24"))) void vstore4(ushort4, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg24(short4 v, size_t i, __global short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4( short4, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg24"))) void vstore4(ushort4, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl24(short4 v, size_t i, __local short *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4( short4, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl24"))) void vstore4(ushort4, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp44(int4 v, size_t i, int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4( int4, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp44"))) void vstore4(uint4, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg44(int4 v, size_t i, __global int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4( int4, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg44"))) void vstore4(uint4, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl44(int4 v, size_t i, __local int *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4( int4, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl44"))) void vstore4(uint4, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp84(long4 v, size_t i, long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4( long4, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp84"))) void vstore4(ulong4, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg84(long4 v, size_t i, __global long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4( long4, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg84"))) void vstore4(ulong4, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl84(long4 v, size_t i, __local long *p)
+{
+    p += i * 4;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4( long4, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl84"))) void vstore4(ulong4, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp18(char8 v, size_t i, char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8( char8, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp18"))) void vstore8(uchar8, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg18(char8 v, size_t i, __global char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8( char8, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg18"))) void vstore8(uchar8, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl18(char8 v, size_t i, __local char *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8( char8, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl18"))) void vstore8(uchar8, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp28(short8 v, size_t i, short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8( short8, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp28"))) void vstore8(ushort8, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg28(short8 v, size_t i, __global short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8( short8, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg28"))) void vstore8(ushort8, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl28(short8 v, size_t i, __local short *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8( short8, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl28"))) void vstore8(ushort8, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp48(int8 v, size_t i, int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8( int8, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp48"))) void vstore8(uint8, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg48(int8 v, size_t i, __global int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8( int8, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg48"))) void vstore8(uint8, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl48(int8 v, size_t i, __local int *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8( int8, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl48"))) void vstore8(uint8, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp88(long8 v, size_t i, long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8( long8, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp88"))) void vstore8(ulong8, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg88(long8 v, size_t i, __global long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8( long8, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg88"))) void vstore8(ulong8, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl88(long8 v, size_t i, __local long *p)
+{
+    p += i * 8;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8( long8, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl88"))) void vstore8(ulong8, size_t, __local ulong *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp116(char16 v, size_t i, char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16( char16, size_t,  char *);
+extern __attribute__((overloadable, weak, alias("vstp116"))) void vstore16(uchar16, size_t, uchar *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg116(char16 v, size_t i, __global char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16( char16, size_t, __global  char *);
+extern __attribute__((overloadable, weak, alias("vstg116"))) void vstore16(uchar16, size_t, __global uchar *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl116(char16 v, size_t i, __local char *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16( char16, size_t, __local  char *);
+extern __attribute__((overloadable, weak, alias("vstl116"))) void vstore16(uchar16, size_t, __local uchar *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp216(short16 v, size_t i, short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16( short16, size_t,  short *);
+extern __attribute__((overloadable, weak, alias("vstp216"))) void vstore16(ushort16, size_t, ushort *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg216(short16 v, size_t i, __global short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16( short16, size_t, __global  short *);
+extern __attribute__((overloadable, weak, alias("vstg216"))) void vstore16(ushort16, size_t, __global ushort *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl216(short16 v, size_t i, __local short *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16( short16, size_t, __local  short *);
+extern __attribute__((overloadable, weak, alias("vstl216"))) void vstore16(ushort16, size_t, __local ushort *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp416(int16 v, size_t i, int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16( int16, size_t,  int *);
+extern __attribute__((overloadable, weak, alias("vstp416"))) void vstore16(uint16, size_t, uint *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg416(int16 v, size_t i, __global int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16( int16, size_t, __global  int *);
+extern __attribute__((overloadable, weak, alias("vstg416"))) void vstore16(uint16, size_t, __global uint *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl416(int16 v, size_t i, __local int *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16( int16, size_t, __local  int *);
+extern __attribute__((overloadable, weak, alias("vstl416"))) void vstore16(uint16, size_t, __local uint *);
+#endif
+
+
+__attribute__((always_inline)) static void
+vstp816(long16 v, size_t i, long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16( long16, size_t,  long *);
+extern __attribute__((overloadable, weak, alias("vstp816"))) void vstore16(ulong16, size_t, ulong *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstg816(long16 v, size_t i, __global long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16( long16, size_t, __global  long *);
+extern __attribute__((overloadable, weak, alias("vstg816"))) void vstore16(ulong16, size_t, __global ulong *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+__attribute__((always_inline)) static void
+vstl816(long16 v, size_t i, __local long *p)
+{
+    p += i * 16;
+    p[0] = v.s0;
+    p[1] = v.s1;
+    p[2] = v.s2;
+    p[3] = v.s3;
+    p[4] = v.s4;
+    p[5] = v.s5;
+    p[6] = v.s6;
+    p[7] = v.s7;
+    p[8] = v.s8;
+    p[9] = v.s9;
+    p[10] = v.sa;
+    p[11] = v.sb;
+    p[12] = v.sc;
+    p[13] = v.sd;
+    p[14] = v.se;
+    p[15] = v.sf;
+
+}
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16( long16, size_t, __local  long *);
+extern __attribute__((overloadable, weak, alias("vstl816"))) void vstore16(ulong16, size_t, __local ulong *);
+#endif
+

Added: libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/vldst/vldst_half.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,4237 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhp(size_t i, const half *p)
+{
+    ushort h = *(const short *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhp"))) float  vload_half(size_t, const half *);
+extern __attribute__((overloadable, weak, alias("vldhp"))) float vloada_half(size_t, const half *);
+
+
+
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhc(size_t i, const __constant half *p)
+{
+    ushort h = *(const __constant short *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhc"))) float  vload_half(size_t, const __constant half *);
+extern __attribute__((overloadable, weak, alias("vldhc"))) float vloada_half(size_t, const __constant half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhg(size_t i, const __global half *p)
+{
+    ushort h = *(const __global short *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhg"))) float  vload_half(size_t, const __global half *);
+extern __attribute__((overloadable, weak, alias("vldhg"))) float vloada_half(size_t, const __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float __cvt_f16_to_f32(ushort);
+
+__attribute__((always_inline)) static float
+vldhl(size_t i, const __local half *p)
+{
+    ushort h = *(const __local short *)(p + i);
+    return __cvt_f16_to_f32(h);
+}
+extern __attribute__((overloadable, weak, alias("vldhl"))) float  vload_half(size_t, const __local half *);
+extern __attribute__((overloadable, weak, alias("vldhl"))) float vloada_half(size_t, const __local half *);
+#endif
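
vload_half simply reinterprets the half as a ushort and hands it to the
__cvt_f16_to_f32 builtin, which presumably lowers to a single HSAIL convert
instruction. Purely for illustration, a scalar sketch of the binary16 ->
binary32 conversion that builtin has to perform could look like the following
(the helper name is hypothetical and this is not the library's implementation;
it assumes IEEE-754 half and float formats):

    static float half_bits_to_float(ushort h)
    {
        uint sign = (uint)(h & 0x8000u) << 16;
        uint exp  = (h >> 10) & 0x1fu;
        uint man  = h & 0x3ffu;
        uint bits;

        if (exp == 0) {
            if (man == 0) {
                bits = sign;                               /* signed zero */
            } else {
                /* subnormal half: renormalize into a float normal */
                int e = -1;
                do { man <<= 1; ++e; } while ((man & 0x400u) == 0);
                bits = sign | ((uint)(127 - 15 - e) << 23) | ((man & 0x3ffu) << 13);
            }
        } else if (exp == 0x1fu) {
            bits = sign | 0x7f800000u | (man << 13);       /* inf or NaN */
        } else {
            bits = sign | ((exp + (127 - 15)) << 23) | (man << 13);  /* normal */
        }
        return as_float(bits);
    }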
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __constant half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __global half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vload_half2(size_t i, const __local half *p)
+{
+    return __cvt_2f16_to_2f32(vload2(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __constant half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __global half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vload_half3(size_t i, const __local half *p)
+{
+    return __cvt_3f16_to_3f32(vload3(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __constant half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __global half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vload_half4(size_t i, const __local half *p)
+{
+    return __cvt_4f16_to_4f32(vload4(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __constant half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __global half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vload_half8(size_t i, const __local half *p)
+{
+    return __cvt_8f16_to_8f32(vload8(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const ushort *)p));
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __constant half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __constant ushort *)p));
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __global half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __global ushort *)p));
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vload_half16(size_t i, const __local half *p)
+{
+    return __cvt_16f16_to_16f32(vload16(i, (const __local ushort *)p));
+}
+#endif
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const ushort2 *)(p + i * 2));
+
+}
+
+
+
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __constant half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __constant ushort2 *)(p + i * 2));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __global half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __global ushort2 *)(p + i * 2));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float2 __cvt_2f16_to_2f32(ushort2);
+
+__attribute__((overloadable, always_inline, weak)) float2
+vloada_half2(size_t i, const __local half *p)
+{
+
+    return __cvt_2f16_to_2f32(*(const __local ushort2 *)(p + i * 2));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const half *p)
+{
+
+    ushort4 h = *(const ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __constant half *p)
+{
+
+    ushort4 h = *(const __constant ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __global half *p)
+{
+
+    ushort4 h = *(const __global ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float3 __cvt_3f16_to_3f32(ushort3);
+
+__attribute__((overloadable, always_inline, weak)) float3
+vloada_half3(size_t i, const __local half *p)
+{
+
+    ushort4 h = *(const __local ushort4 *)(p + i * 4);
+    return __cvt_3f16_to_3f32(h.s012);
+
+}
+#endif
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const ushort4 *)(p + i * 4));
+
+}
+
+
+
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __constant half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __constant ushort4 *)(p + i * 4));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __global half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __global ushort4 *)(p + i * 4));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float4 __cvt_4f16_to_4f32(ushort4);
+
+__attribute__((overloadable, always_inline, weak)) float4
+vloada_half4(size_t i, const __local half *p)
+{
+
+    return __cvt_4f16_to_4f32(*(const __local ushort4 *)(p + i * 4));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const ushort8 *)(p + i * 8));
+
+}
+
+
+
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __constant half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __constant ushort8 *)(p + i * 8));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __global half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __global ushort8 *)(p + i * 8));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float8 __cvt_8f16_to_8f32(ushort8);
+
+__attribute__((overloadable, always_inline, weak)) float8
+vloada_half8(size_t i, const __local half *p)
+{
+
+    return __cvt_8f16_to_8f32(*(const __local ushort8 *)(p + i * 8));
+
+}
+#endif
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const ushort16 *)(p + i * 16));
+
+}
+
+
+
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __constant half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __constant ushort16 *)(p + i * 16));
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __global half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __global ushort16 *)(p + i * 16));
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) float16 __cvt_16f16_to_16f32(ushort16);
+
+__attribute__((overloadable, always_inline, weak)) float16
+vloada_half16(size_t i, const __local half *p)
+{
+
+    return __cvt_16f16_to_16f32(*(const __local ushort16 *)(p + i * 16));
+
+}
+#endif
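
Note the layout difference between the two families just defined: vload_halfN
reads N tightly packed halfs starting at element N*i, while vloada_halfN
assumes the aligned vector layout, so for half3 the data lives in 4-element
slots and the code loads a ushort4 and drops the padding lane (.s3). A minimal
sketch of the distinction (kernel and buffer names are hypothetical):

    __kernel void unpack(__global const half *packed,  /* half3 data, 3-element stride */
                         __global const half *padded,  /* half3 data, 4-element stride */
                         __global float4 *out)
    {
        size_t gid = get_global_id(0);
        float3 a = vload_half3(gid, packed);   /* reads halfs 3*gid .. 3*gid+2 */
        float3 b = vloada_half3(gid, padded);  /* reads halfs 4*gid .. 4*gid+2 */
        out[gid] = (float4)(a + b, 0.0f);
    }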
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthpf32c(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32c")))  void vstore_half(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32c"))) void vstorea_half(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthgf32c(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32c")))  void vstore_half(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32c"))) void vstorea_half(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_cur(float);
+
+__attribute__((always_inline)) static void
+vsthlf32c(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32c")))  void vstore_half(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32c"))) void vstorea_half(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthpf32e(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32e")))  void vstore_half_rte(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32e"))) void vstorea_half_rte(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthgf32e(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32e")))  void vstore_half_rte(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32e"))) void vstorea_half_rte(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rte(float);
+
+__attribute__((always_inline)) static void
+vsthlf32e(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32e")))  void vstore_half_rte(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32e"))) void vstorea_half_rte(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthpf32p(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32p")))  void vstore_half_rtp(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32p"))) void vstorea_half_rtp(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthgf32p(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32p")))  void vstore_half_rtp(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32p"))) void vstorea_half_rtp(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtp(float);
+
+__attribute__((always_inline)) static void
+vsthlf32p(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32p")))  void vstore_half_rtp(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32p"))) void vstorea_half_rtp(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthpf32n(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32n")))  void vstore_half_rtn(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32n"))) void vstorea_half_rtn(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthgf32n(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32n")))  void vstore_half_rtn(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32n"))) void vstorea_half_rtn(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtn(float);
+
+__attribute__((always_inline)) static void
+vsthlf32n(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32n")))  void vstore_half_rtn(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32n"))) void vstorea_half_rtn(float, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthpf32z(float v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf32z")))  void vstore_half_rtz(float, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf32z"))) void vstorea_half_rtz(float, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthgf32z(float v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf32z")))  void vstore_half_rtz(float, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf32z"))) void vstorea_half_rtz(float, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f32_to_f16_rtz(float);
+
+__attribute__((always_inline)) static void
+vsthlf32z(float v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f32_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf32z")))  void vstore_half_rtz(float, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf32z"))) void vstorea_half_rtz(float, size_t, __local half *);
+#endif
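
Each scalar store comes in five flavors: the unsuffixed vstore_half uses the
current (default) rounding mode via __cvt_f32_to_f16_cur, while the _rte,
_rtp, _rtn and _rtz forms request a specific IEEE rounding direction. The
vstorea_half variants alias the same bodies, since for scalars the packed and
aligned layouts coincide. A minimal usage sketch (kernel and buffer names are
hypothetical):

    __kernel void quantize(__global const float *in, __global half *out)
    {
        size_t gid = get_global_id(0);
        /* round to nearest even; _rtz/_rtp/_rtn pick truncation or a
           directed rounding instead */
        vstore_half_rte(in[gid], gid, out);
    }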
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthpf64c(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64c")))  void vstore_half(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64c"))) void vstorea_half(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthgf64c(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64c")))  void vstore_half(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64c"))) void vstorea_half(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_cur(double);
+
+__attribute__((always_inline)) static void
+vsthlf64c(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_cur(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64c")))  void vstore_half(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64c"))) void vstorea_half(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthpf64e(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64e")))  void vstore_half_rte(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64e"))) void vstorea_half_rte(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthgf64e(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64e")))  void vstore_half_rte(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64e"))) void vstorea_half_rte(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rte(double);
+
+__attribute__((always_inline)) static void
+vsthlf64e(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rte(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64e")))  void vstore_half_rte(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64e"))) void vstorea_half_rte(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthpf64p(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64p")))  void vstore_half_rtp(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64p"))) void vstorea_half_rtp(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthgf64p(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64p")))  void vstore_half_rtp(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64p"))) void vstorea_half_rtp(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtp(double);
+
+__attribute__((always_inline)) static void
+vsthlf64p(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtp(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64p")))  void vstore_half_rtp(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64p"))) void vstorea_half_rtp(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthpf64n(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64n")))  void vstore_half_rtn(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64n"))) void vstorea_half_rtn(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthgf64n(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64n")))  void vstore_half_rtn(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64n"))) void vstorea_half_rtn(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtn(double);
+
+__attribute__((always_inline)) static void
+vsthlf64n(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtn(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64n")))  void vstore_half_rtn(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64n"))) void vstorea_half_rtn(double, size_t, __local half *);
+#endif
+
+
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthpf64z(double v, size_t i, half *p)
+{
+    *(ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthpf64z")))  void vstore_half_rtz(double, size_t, half *);
+extern __attribute__((overloadable, weak, alias("vsthpf64z"))) void vstorea_half_rtz(double, size_t, half *);
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthgf64z(double v, size_t i, __global half *p)
+{
+    *(__global ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthgf64z")))  void vstore_half_rtz(double, size_t, __global half *);
+extern __attribute__((overloadable, weak, alias("vsthgf64z"))) void vstorea_half_rtz(double, size_t, __global half *);
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort __cvt_f64_to_f16_rtz(double);
+
+__attribute__((always_inline)) static void
+vsthlf64z(double v, size_t i, __local half *p)
+{
+    *(__local ushort *)(p + i) = __cvt_f64_to_f16_rtz(v);
+}
+extern __attribute__((overloadable, weak, alias("vsthlf64z")))  void vstore_half_rtz(double, size_t, __local half *);
+extern __attribute__((overloadable, weak, alias("vsthlf64z"))) void vstorea_half_rtz(double, size_t, __local half *);
+#endif
+
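For reference, the scalar explicit-rounding overloads above are what a kernel call resolves to when a double value is stored as a 16-bit half with a chosen rounding mode. A minimal usage sketch follows; the kernel and buffer names are hypothetical and not part of the patch:

    #pragma OPENCL EXTENSION cl_khr_fp64 : enable

    __kernel void pack_f64_to_f16(__global const double *src, __global half *dst)
    {
        size_t gid = get_global_id(0);
        /* Converts src[gid] to half with round-toward-zero and stores the
           16-bit result at dst + gid. */
        vstore_half_rtz(src[gid], gid, dst);
    }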
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(float2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f32_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(float3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f32_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(float4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f32_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(float8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f32_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(float16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f32_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
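The double-source overloads that follow mirror the float-source pattern above: convert with the matching __cvt_*f64_to_*f16_* builtin and forward to vstoreN on a ushort-typed pointer. A minimal usage sketch, with hypothetical kernel and buffer names:

    #pragma OPENCL EXTENSION cl_khr_fp64 : enable

    __kernel void pack_f64x2(__global const double2 *src, __global half *dst)
    {
        size_t gid = get_global_id(0);
        /* Converts two doubles to half, rounding toward negative infinity,
           and stores them at dst + gid * 2. */
        vstore_half2_rtn(src[gid], gid, dst);
    }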
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rte(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtp(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtn(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __global half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half2_rtz(double2 v, size_t i, __local half *p)
+{
+    vstore2(__cvt_2f64_to_2f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rte(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtp(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtn(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __global half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half3_rtz(double3 v, size_t i, __local half *p)
+{
+    vstore3(__cvt_3f64_to_3f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rte(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtp(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtn(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __global half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half4_rtz(double4 v, size_t i, __local half *p)
+{
+    vstore4(__cvt_4f64_to_4f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rte(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtp(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtn(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __global half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half8_rtz(double8 v, size_t i, __local half *p)
+{
+    vstore8(__cvt_8f64_to_8f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_cur(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rte(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rte(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtp(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtp(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtn(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtn(v), i, (__local ushort *)p);
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (ushort *)p);
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __global half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__global ushort *)p);
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstore_half16_rtz(double16 v, size_t i, __local half *p)
+{
+    vstore16(__cvt_16f64_to_16f16_rtz(v), i, (__local ushort *)p);
+}
+#endif
+
+
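The vstorea_half* overloads that follow differ from the vstore_half* forms above in their alignment assumptions: the destination is taken to be aligned to the full vector size, so the implementations use a single aligned vector store (a plain pointer cast) instead of calling vstoreN, and the three-element form writes into a four-element slot at p + i * 4, leaving the fourth ushort lane without a defined value. A minimal usage sketch, with hypothetical kernel and buffer names:

    __kernel void pack_f32x3(__global const float *src, __global half *dst)
    {
        size_t gid = get_global_id(0);
        float3 v = vload3(gid, src);
        /* Writes three converted halves into the four-element slot at
           dst + gid * 4; the fourth 16-bit lane gets no defined value. */
        vstorea_half3(v, gid, dst);
    }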
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_cur(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rte(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtp(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtn(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f32_to_2f16_rtz(float2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(float2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f32_to_2f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_cur(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_cur(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rte(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rte(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtp(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtp(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtn(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtn(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f32_to_3f16_rtz(float3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(float3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f32_to_3f16_rtz(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_cur(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rte(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtp(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtn(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f32_to_4f16_rtz(float4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(float4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f32_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_cur(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rte(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtp(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtn(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f32_to_8f16_rtz(float8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(float8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f32_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_cur(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rte(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtp(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtn(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f32_to_16f16_rtz(float16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(float16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f32_to_16f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_cur(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rte(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rte(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtp(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtp(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtn(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtn(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, half *p)
+{
+
+    *(ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __global half *p)
+{
+
+    *(__global ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort2 __cvt_2f64_to_2f16_rtz(double2);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half2_rtz(double2 v, size_t i, __local half *p)
+{
+
+    *(__local ushort2 *)(p + i * 2) = __cvt_2f64_to_2f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_cur(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_cur(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rte(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rte(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rte(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtp(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtp(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtp(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtn(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtn(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtn(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(ushort4 *)(p + i * 4) = h;
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __global half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(__global ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort3 __cvt_3f64_to_3f16_rtz(double3);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half3_rtz(double3 v, size_t i, __local half *p)
+{
+
+    ushort4 h;
+    h.s012 = __cvt_3f64_to_3f16_rtz(v);
+    *(__local ushort4 *)(p + i * 4) = h;
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_cur(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rte(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rte(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtp(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtp(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtn(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtn(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, half *p)
+{
+
+    *(ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __global half *p)
+{
+
+    *(__global ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort4 __cvt_4f64_to_4f16_rtz(double4);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half4_rtz(double4 v, size_t i, __local half *p)
+{
+
+    *(__local ushort4 *)(p + i * 4) = __cvt_4f64_to_4f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_cur(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rte(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rte(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtp(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtp(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtn(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtn(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, half *p)
+{
+
+    *(ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __global half *p)
+{
+
+    *(__global ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort8 __cvt_8f64_to_8f16_rtz(double8);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half8_rtz(double8 v, size_t i, __local half *p)
+{
+
+    *(__local ushort8 *)(p + i * 8) = __cvt_8f64_to_8f16_rtz(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_cur(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_cur(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rte(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rte(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rte(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtp(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtp(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtp(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtn(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtn(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtn(v);
+
+}
+#endif
+
+
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, half *p)
+{
+
+    *(ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __global half *p)
+{
+
+    *(__global ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
+
+#if __OPENCL_C_VERSION__ < 200
+extern __attribute__((pure)) ushort16 __cvt_16f64_to_16f16_rtz(double16);
+
+__attribute__((overloadable, always_inline, weak)) void
+vstorea_half16_rtz(double16 v, size_t i, __local half *p)
+{
+
+    *(__local ushort16 *)(p + i * 16) = __cvt_16f64_to_16f16_rtz(v);
+
+}
+#endif
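
For reference, a minimal OpenCL kernel sketch showing how the aligned half-store overloads above might be called from user code; the kernel and buffer names are illustrative and are not part of this patch:

    // Hypothetical kernel: store each work-item's float4 result into half
    // buffers, once with the current rounding mode and once round-toward-zero.
    __kernel void pack_to_half(__global const float4 *src,
                               __global half *dst_cur,
                               __global half *dst_rtz)
    {
        size_t gid = get_global_id(0);
        float4 v = src[gid];
        vstorea_half4(v, gid, dst_cur);      // maps to __cvt_4f32_to_4f16_cur above
        vstorea_half4_rtz(v, gid, dst_rtz);  // maps to __cvt_4f32_to_4f16_rtz above
    }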

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wg.h Tue Oct  7 12:10:46 2014
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+// XXX The runtime computes CL_DEVICE_MAX_WORK_GROUP_SIZE as
+// XXX dev->wave_front_size * dev->max_waves_per_simd
+// XXX If max_waves_per_simd is ever raised then this code will need to be updated
+#define MAX_WAVES_PER_SIMD  4
+
+#pragma OPENCL EXTENSION cl_amd_program_scope_locals : enable
+extern __local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
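
As context for the constant above (not part of the patch), the comment's formula implies the following sizing; the 64-wide wavefront is an assumed example:

    // Illustrative arithmetic only; wave_front_size = 64 is an assumption.
    // CL_DEVICE_MAX_WORK_GROUP_SIZE = wave_front_size * MAX_WAVES_PER_SIMD
    //                               = 64 * 4 = 256 work-items
    // So a work-group has at most MAX_WAVES_PER_SIMD sub-groups, and one ulong
    // of __wg_scratch per sub-group suffices for the reductions/scans below.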

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wganyall.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,61 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GEN_AA(SUF,ID) \
+__attribute__((overloadable, always_inline)) int \
+work_group_##SUF(int predicate) \
+{ \
+    uint n = get_num_sub_groups(); \
+    int a = sub_group_##SUF(predicate); \
+    if (n == 1) \
+	return a; \
+ \
+    __local int *p = (__local int *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : ID; \
+	a = sub_group_##SUF(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+ \
+    return a; \
+}
+
+GEN_AA(all, 1U)
+GEN_AA(any, 0U)
+
+#endif
+
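
A minimal, hypothetical use of the work-group predicates generated by GEN_AA, assuming an OpenCL 2.0 build; kernel and buffer names are illustrative:

    // Flag, per work-group, whether any/all input elements exceed a threshold.
    __kernel void check_threshold(__global const float *in, __global int *flags,
                                  float threshold)
    {
        int pred = in[get_global_id(0)] > threshold;
        int any_hit = work_group_any(pred);   // nonzero if any work-item's pred is set
        int all_hit = work_group_all(pred);   // nonzero only if every pred is set
        if (get_local_id(0) == 0) {
            flags[2 * get_group_id(0)]     = any_hit;
            flags[2 * get_group_id(0) + 1] = all_hit;
        }
    }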

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgbarrier.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+extern void __hsail_barrier(void);
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags, memory_scope scope)
+{
+    atomic_work_item_fence(flags, memory_order_release, scope);
+    __hsail_barrier();
+    atomic_work_item_fence(flags, memory_order_acquire, scope);
+}
+
+__attribute__((overloadable, weak, always_inline)) void
+work_group_barrier(cl_mem_fence_flags flags)
+{
+    work_group_barrier(flags, memory_scope_work_group);
+}
+
+#endif
+
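
For illustration only, the release/acquire fencing around __hsail_barrier is what makes a local-memory handoff like the following valid; the kernel is a sketch, not part of the patch:

    // One work-item publishes a value in __local memory; all work-items read it.
    __kernel void handoff(__global int *out)
    {
        __local int shared;
        if (get_local_id(0) == 0)
            shared = 42;                          // written before the barrier
        work_group_barrier(CLK_LOCAL_MEM_FENCE);  // release + barrier + acquire
        out[get_global_id(0)] = shared;           // every work-item observes 42
    }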

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgbcast.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,75 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GEN_BROADCAST(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x) \
+{ \
+    if (get_num_sub_groups() == 1) \
+        return sub_group_broadcast(a, local_id_x); \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y) \
+{ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+} \
+\
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_broadcast(TYPE a, size_t local_id_x, size_t local_id_y, size_t local_id_z) \
+{ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    if (get_local_id(0) == local_id_x && get_local_id(1) == local_id_y && get_local_id(2) == local_id_z) \
+        *p = a; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = *p; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+GEN_BROADCAST(uint)
+GEN_BROADCAST(int)
+GEN_BROADCAST(ulong)
+GEN_BROADCAST(long)
+GEN_BROADCAST(float)
+GEN_BROADCAST(double)
+
+#endif
+
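
A hypothetical caller of the broadcast overloads generated above, assuming a one-dimensional NDRange; names are illustrative:

    // Scale every element of a work-group by the value held by local id 0.
    __kernel void scale_by_first(__global float *data)
    {
        size_t gid = get_global_id(0);
        float scale = work_group_broadcast(data[gid], (size_t)0);
        data[gid] *= scale;
    }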

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgreduce.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+#define GENA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_reduce_add(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+        p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : (TYPE)0; \
+	a = sub_group_reduce_add(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+#define GENO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_reduce_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_reduce_##SUF(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == 0) \
+        p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	a = l < n ? p[l] : ID; \
+	a = sub_group_reduce_##SUF(a); \
+	if (l == 0) \
+	    p[0] = a; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    a = p[0]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return a; \
+}
+
+GENA(int)
+GENA(uint)
+GENA(long)
+GENA(ulong)
+GENA(float)
+GENA(double)
+
+GENO(int,max,INT_MIN)
+GENO(uint,max,0U)
+GENO(long,max,LONG_MIN)
+GENO(ulong,max,0UL)
+GENO(float,max,-INFINITY)
+GENO(double,max,-(double)INFINITY)
+
+GENO(int,min,INT_MAX)
+GENO(uint,min,UINT_MAX)
+GENO(long,min,LONG_MAX)
+GENO(ulong,min,ULONG_MAX)
+GENO(float,min,INFINITY)
+GENO(double,min,(double)INFINITY)
+
+#endif
+
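
A sketch of how the reductions generated by GENA/GENO might be used; the kernel and buffer names are assumptions:

    // Write one per-work-group sum, computed cooperatively by all work-items.
    __kernel void block_sum(__global const float *in, __global float *sums)
    {
        float total = work_group_reduce_add(in[get_global_id(0)]);
        if (get_local_id(0) == 0)          // every work-item holds the same total
            sums[get_group_id(0)] = total;
    }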

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgscan.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#include "wg.h"
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#define GENIA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_scan_inclusive_add(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE t = l < n ? p[l] : (TYPE)0; \
+	t = sub_group_scan_inclusive_add(t); \
+	if (l < n) \
+	    p[l] = t; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? a : a + p[i-1]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+#define GENIO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_inclusive_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    a = sub_group_scan_inclusive_##SUF(a); \
+    if (n == 1) \
+        return a; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE t = l < n ? p[l] : ID; \
+	t = sub_group_scan_inclusive_##SUF(t); \
+	if (l < n) \
+	    p[l] = t; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? a : SUF(a, p[i-1]); \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+GENIA(int)
+GENIA(uint)
+GENIA(long)
+GENIA(ulong)
+GENIA(float)
+GENIA(double)
+
+GENIO(int,max,INT_MIN)
+GENIO(uint,max,0U)
+GENIO(long,max,LONG_MIN)
+GENIO(ulong,max,0UL)
+GENIO(float,max,-INFINITY)
+GENIO(double,max,-(double)INFINITY)
+
+GENIO(int,min,INT_MAX)
+GENIO(uint,min,UINT_MAX)
+GENIO(long,min,LONG_MAX)
+GENIO(ulong,min,ULONG_MAX)
+GENIO(float,min,INFINITY)
+GENIO(double,min,(double)INFINITY)
+
+#define GENEA(TYPE) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_add(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    TYPE t = sub_group_scan_exclusive_add(a); \
+    if (n == 1) \
+        return t; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = a + t; \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE s = l < n ? p[l] : (TYPE)0; \
+	s = sub_group_scan_inclusive_add(s); \
+	if (l < n) \
+	    p[l] = s; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? t : t + p[i-1]; \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+#define GENEO(TYPE,SUF,ID) \
+__attribute__((overloadable,weak,always_inline)) TYPE \
+work_group_scan_exclusive_##SUF(TYPE a) \
+{ \
+    uint n = get_num_sub_groups(); \
+    TYPE t = sub_group_scan_exclusive_##SUF(a); \
+    if (n == 1) \
+        return t; \
+ \
+    __local TYPE *p = (__local TYPE *)__wg_scratch; \
+    uint l = get_sub_group_local_id(); \
+    uint i = get_sub_group_id(); \
+ \
+    if (l == get_sub_group_size() - 1U) \
+	p[i] = SUF(a, t); \
+ \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    if (i == 0) { \
+	TYPE s = l < n ? p[l] : ID; \
+	s = sub_group_scan_inclusive_##SUF(s); \
+	if (l < n) \
+	    p[l] = s; \
+    } \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    TYPE ret = i == 0 ? t : SUF(t, p[i-1]); \
+    work_group_barrier(CLK_LOCAL_MEM_FENCE); \
+    return ret; \
+}
+
+GENEA(int)
+GENEA(uint)
+GENEA(long)
+GENEA(ulong)
+GENEA(float)
+GENEA(double)
+
+GENEO(int,max,INT_MIN)
+GENEO(uint,max,0U)
+GENEO(long,max,LONG_MIN)
+GENEO(ulong,max,0UL)
+GENEO(float,max,-INFINITY)
+GENEO(double,max,-(double)INFINITY)
+
+GENEO(int,min,INT_MAX)
+GENEO(uint,min,UINT_MAX)
+GENEO(long,min,LONG_MAX)
+GENEO(ulong,min,ULONG_MAX)
+GENEO(float,min,INFINITY)
+GENEO(double,min,(double)INFINITY)
+
+#endif
+
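
A sketch of a typical consumer of the exclusive scans generated above, here for stream compaction within a work-group; names are illustrative:

    // Keep only positive elements, packed to the front of each work-group's slice.
    __kernel void compact_positive(__global const int *in, __global int *out)
    {
        size_t gid = get_global_id(0);
        int keep = in[gid] > 0;
        int pos = work_group_scan_exclusive_add(keep);  // slot within the work-group
        if (keep)
            out[get_group_id(0) * get_local_size(0) + pos] = in[gid];
    }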

Added: libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl
URL: http://llvm.org/viewvc/llvm-project/libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl?rev=219217&view=auto
==============================================================================
--- libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl (added)
+++ libclc/branches/amd-builtins/amd-builtins/workgroup/wgscratch.cl Tue Oct  7 12:10:46 2014
@@ -0,0 +1,30 @@
+/*
+ * Copyright (c) 2014 Advanced Micro Devices, Inc.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+
+#if __OPENCL_C_VERSION__ >= 200
+
+#include "wg.h"
+
+// Temporary data for work group functions
+__local ulong __wg_scratch[MAX_WAVES_PER_SIMD];
+
+#endif




