[libclc] [libclc] Move several integer functions to CLC library (PR #116786)

Tue Nov 19 03:25:52 PST 2024

https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/116786

This commit moves over the OpenCL clz, hadd, mad24, mad_hi, mul24, mul_hi, popcount, rhadd, and upsample builtins to the CLC library. There are no changes to any target's CLC libraries.

The OpenCL mad_hi builtin wasn't previously publicly available from the CLC libraries, as it was hash-defined to mul_hi in the header files. That issue has been fixed, and mad_hi is now exposed.

The custom AMD implementation/workaround for popcount has been removed as it was only valid for clang < 7.

There are still three integer functions which haven't been moved over. The OpenCL add_sat, sub_sat, and mad_sat builtins require saturating conversion builtins which haven't yet been ported.

>From 3f05aee5651a8364d4f3ba45bfc8024ff5beec8c Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Mon, 18 Nov 2024 17:29:42 +0000
Subject: [PATCH] [libclc] Move several integer functions to CLC library

This commit moves over the OpenCL clz, hadd, mad24, mad_hi, mul24,
mul_hi, popcount, rhadd, and upsample builtins to the CLC library. There
are no changes to any target's CLC libraries.

The OpenCL mad_hi builtin wasn't previously publicly available from the
CLC libraries, as it was hash-defined to mul_hi in the header files.
That issue has been fixed, and mad_hi is now exposed.

The custom AMD implementation/workaround for popcount has been removed
as it was only valid for clang < 7.

There are still three integer functions which haven't been moved over.
The OpenCL add_sat, sub_sat, and mad_sat builtins require saturating
conversion builtins which haven't yet been ported.
---
 libclc/amdgcn/lib/SOURCES                     |   1 -
 libclc/amdgcn/lib/integer/popcount.cl         |   6 -
 libclc/amdgcn/lib/integer/popcount.inc        |  17 ---
 libclc/clc/include/clc/integer/binary_decl.h  |   2 +
 libclc/clc/include/clc/integer/clc_clz.h      |  21 +++
 libclc/clc/include/clc/integer/clc_hadd.h     |  21 +++
 libclc/clc/include/clc/integer/clc_mad24.h    |  21 +++
 libclc/clc/include/clc/integer/clc_mad_hi.h   |   8 ++
 libclc/clc/include/clc/integer/clc_mul24.h    |  21 +++
 libclc/clc/include/clc/integer/clc_mul_hi.h   |  21 +++
 libclc/clc/include/clc/integer/clc_popcount.h |  19 +++
 libclc/clc/include/clc/integer/clc_rhadd.h    |  21 +++
 libclc/clc/include/clc/integer/clc_upsample.h |  38 +++++
 .../include/clc/integer/definitions.h         |   7 +-
 libclc/clc/include/clc/integer/gentype24.inc  | 134 ++++++++++++++++++
 libclc/clc/include/clc/integer/ternary_decl.h |   2 +
 libclc/clc/include/clc/integer/unary_decl.h   |   1 +
 .../clc/include/clc/integer/unary_intrin.inc  |  26 ++++
 libclc/clc/lib/generic/SOURCES                |   7 +
 libclc/clc/lib/generic/integer/clc_clz.cl     |  44 ++++++
 libclc/clc/lib/generic/integer/clc_hadd.cl    |   4 +
 libclc/clc/lib/generic/integer/clc_hadd.inc   |   8 ++
 libclc/clc/lib/generic/integer/clc_mad24.cl   |   5 +
 libclc/clc/lib/generic/integer/clc_mad24.inc  |   5 +
 libclc/clc/lib/generic/integer/clc_mul24.cl   |   4 +
 .../lib/generic/integer/clc_mul24.inc}        |   4 +-
 libclc/clc/lib/generic/integer/clc_mul_hi.cl  | 113 +++++++++++++++
 libclc/clc/lib/generic/integer/clc_rhadd.cl   |   4 +
 libclc/clc/lib/generic/integer/clc_rhadd.inc  |   8 ++
 .../clc/lib/generic/integer/clc_upsample.cl   |  45 ++++++
 libclc/generic/include/clc/integer/clz.h      |   6 +-
 libclc/generic/include/clc/integer/clz.inc    |   1 -
 libclc/generic/include/clc/integer/hadd.h     |   6 +-
 libclc/generic/include/clc/integer/hadd.inc   |   1 -
 libclc/generic/include/clc/integer/mad24.h    |   9 +-
 libclc/generic/include/clc/integer/mad24.inc  |   1 -
 libclc/generic/include/clc/integer/mad_hi.h   |   7 +-
 libclc/generic/include/clc/integer/mul24.h    |   9 +-
 libclc/generic/include/clc/integer/mul24.inc  |   1 -
 libclc/generic/include/clc/integer/mul_hi.h   |   6 +-
 libclc/generic/include/clc/integer/mul_hi.inc |   1 -
 libclc/generic/include/clc/integer/popcount.h |   9 +-
 libclc/generic/include/clc/integer/rhadd.h    |   6 +-
 libclc/generic/include/clc/integer/rhadd.inc  |   1 -
 libclc/generic/include/clc/integer/upsample.h |  33 +++--
 libclc/generic/include/integer/popcount.h     |   3 -
 .../generic/include/integer/unary_intrin.inc  |  20 ---
 libclc/generic/lib/SOURCES                    |   1 +
 libclc/generic/lib/integer/binary_def.inc     |   8 ++
 libclc/generic/lib/integer/clz.cl             |  44 +-----
 libclc/generic/lib/integer/hadd.cl            |   5 +-
 libclc/generic/lib/integer/hadd.inc           |   6 -
 libclc/generic/lib/integer/mad24.cl           |   7 +-
 libclc/generic/lib/integer/mad24.inc          |   3 -
 libclc/generic/lib/integer/mad_hi.cl          |   7 +
 libclc/generic/lib/integer/mul24.cl           |   7 +-
 libclc/generic/lib/integer/mul_hi.cl          | 110 +-------------
 libclc/generic/lib/integer/popcount.cl        |   7 +-
 libclc/generic/lib/integer/rhadd.cl           |   5 +-
 libclc/generic/lib/integer/rhadd.inc          |   6 -
 libclc/generic/lib/integer/ternary_def.inc    |   8 ++
 libclc/generic/lib/integer/unary_def.inc      |   7 +
 libclc/generic/lib/integer/upsample.cl        |  54 +++----
 libclc/generic/lib/math/clc_fma.cl            |   3 +-
 libclc/generic/lib/math/clc_fmod.cl           |   5 +-
 libclc/generic/lib/math/clc_remainder.cl      |   5 +-
 libclc/generic/lib/math/clc_remquo.cl         |   5 +-
 libclc/generic/lib/math/sincos_helpers.cl     |  20 +--
 68 files changed, 780 insertions(+), 301 deletions(-)
 delete mode 100644 libclc/amdgcn/lib/integer/popcount.cl
 delete mode 100644 libclc/amdgcn/lib/integer/popcount.inc
 create mode 100644 libclc/clc/include/clc/integer/binary_decl.h
 create mode 100644 libclc/clc/include/clc/integer/clc_clz.h
 create mode 100644 libclc/clc/include/clc/integer/clc_hadd.h
 create mode 100644 libclc/clc/include/clc/integer/clc_mad24.h
 create mode 100644 libclc/clc/include/clc/integer/clc_mad_hi.h
 create mode 100644 libclc/clc/include/clc/integer/clc_mul24.h
 create mode 100644 libclc/clc/include/clc/integer/clc_mul_hi.h
 create mode 100644 libclc/clc/include/clc/integer/clc_popcount.h
 create mode 100644 libclc/clc/include/clc/integer/clc_rhadd.h
 create mode 100644 libclc/clc/include/clc/integer/clc_upsample.h
 rename libclc/{generic => clc}/include/clc/integer/definitions.h (71%)
 create mode 100644 libclc/clc/include/clc/integer/gentype24.inc
 create mode 100644 libclc/clc/include/clc/integer/ternary_decl.h
 create mode 100644 libclc/clc/include/clc/integer/unary_decl.h
 create mode 100644 libclc/clc/include/clc/integer/unary_intrin.inc
 create mode 100644 libclc/clc/lib/generic/integer/clc_clz.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_hadd.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_hadd.inc
 create mode 100644 libclc/clc/lib/generic/integer/clc_mad24.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_mad24.inc
 create mode 100644 libclc/clc/lib/generic/integer/clc_mul24.cl
 rename libclc/{generic/lib/integer/mul24.inc => clc/lib/generic/integer/clc_mul24.inc} (68%)
 create mode 100644 libclc/clc/lib/generic/integer/clc_mul_hi.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_rhadd.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_rhadd.inc
 create mode 100644 libclc/clc/lib/generic/integer/clc_upsample.cl
 delete mode 100644 libclc/generic/include/clc/integer/clz.inc
 delete mode 100644 libclc/generic/include/clc/integer/hadd.inc
 delete mode 100644 libclc/generic/include/clc/integer/mad24.inc
 delete mode 100644 libclc/generic/include/clc/integer/mul24.inc
 delete mode 100644 libclc/generic/include/clc/integer/mul_hi.inc
 delete mode 100644 libclc/generic/include/clc/integer/rhadd.inc
 delete mode 100644 libclc/generic/include/integer/popcount.h
 delete mode 100644 libclc/generic/include/integer/unary_intrin.inc
 create mode 100644 libclc/generic/lib/integer/binary_def.inc
 delete mode 100644 libclc/generic/lib/integer/hadd.inc
 delete mode 100644 libclc/generic/lib/integer/mad24.inc
 create mode 100644 libclc/generic/lib/integer/mad_hi.cl
 delete mode 100644 libclc/generic/lib/integer/rhadd.inc
 create mode 100644 libclc/generic/lib/integer/ternary_def.inc
 create mode 100644 libclc/generic/lib/integer/unary_def.inc

diff --git a/libclc/amdgcn/lib/SOURCES b/libclc/amdgcn/lib/SOURCES
index b235457f9ab7c3..4ea66385fe50ee 100644
--- a/libclc/amdgcn/lib/SOURCES
+++ b/libclc/amdgcn/lib/SOURCES
@@ -1,5 +1,4 @@
 cl_khr_int64_extended_atomics/minmax_helpers.ll
-integer/popcount.cl
 math/fmax.cl
 math/fmin.cl
 math/ldexp.cl
diff --git a/libclc/amdgcn/lib/integer/popcount.cl b/libclc/amdgcn/lib/integer/popcount.cl
deleted file mode 100644
index 3b493fbd146f01..00000000000000
--- a/libclc/amdgcn/lib/integer/popcount.cl
+++ /dev/null
@@ -1,6 +0,0 @@
-#include <clc/clc.h>
-#include <clc/utils.h>
-#include <integer/popcount.h>
-
-#define __CLC_BODY "popcount.inc"
-#include <clc/integer/gentype.inc>
diff --git a/libclc/amdgcn/lib/integer/popcount.inc b/libclc/amdgcn/lib/integer/popcount.inc
deleted file mode 100644
index 402ddb768c6a6f..00000000000000
--- a/libclc/amdgcn/lib/integer/popcount.inc
+++ /dev/null
@@ -1,17 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE popcount(__CLC_GENTYPE x) {
-/* LLVM-4+ implements i16 ops for VI+ ASICs. However, ctpop implementation
- * is missing until r326535. Therefore we have to convert sub i32 types to uint
- * as a workaround. */
-#if __clang_major__ < 7 && __clang_major__ > 3 && __CLC_GENSIZE < 32
-	/* Prevent sign extension on uint conversion */
-	const __CLC_U_GENTYPE y = __CLC_XCONCAT(as_, __CLC_U_GENTYPE)(x);
-	/* Convert to uintX */
-	const __CLC_XCONCAT(uint, __CLC_VECSIZE) z = __CLC_XCONCAT(convert_uint, __CLC_VECSIZE)(y);
-	/* Call popcount on uintX type */
-	const __CLC_XCONCAT(uint, __CLC_VECSIZE) res = __clc_native_popcount(z);
-	/* Convert the result back to gentype. */
-	return __CLC_XCONCAT(convert_, __CLC_GENTYPE)(res);
-#else
-	return __clc_native_popcount(x);
-#endif
-}
diff --git a/libclc/clc/include/clc/integer/binary_decl.h b/libclc/clc/include/clc/integer/binary_decl.h
new file mode 100644
index 00000000000000..b54f36ba9b6c3c
--- /dev/null
+++ b/libclc/clc/include/clc/integer/binary_decl.h
@@ -0,0 +1,2 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x,
+                                               __CLC_GENTYPE y);
diff --git a/libclc/clc/include/clc/integer/clc_clz.h b/libclc/clc/include/clc/integer/clc_clz.h
new file mode 100644
index 00000000000000..1e2d23084bf7e3
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_clz.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_CLZ_H__
+#define __CLC_INTEGER_CLC_CLZ_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible clz
+#define __clc_clz clz
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_clz
+#define __CLC_BODY "unary_decl.h"
+
+#include <clc/integer/gentype.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_CLZ_H__
diff --git a/libclc/clc/include/clc/integer/clc_hadd.h b/libclc/clc/include/clc/integer/clc_hadd.h
new file mode 100644
index 00000000000000..7eb91ae45a8085
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_hadd.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_HADD_H__
+#define __CLC_INTEGER_CLC_HADD_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible hadd
+#define __clc_hadd hadd
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_hadd
+#define __CLC_BODY "binary_decl.h"
+
+#include <clc/integer/gentype.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_HADD_H__
diff --git a/libclc/clc/include/clc/integer/clc_mad24.h b/libclc/clc/include/clc/integer/clc_mad24.h
new file mode 100644
index 00000000000000..354b019e86688c
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_mad24.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_MAD24_H__
+#define __CLC_INTEGER_CLC_MAD24_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible mad24
+#define __clc_mad24 mad24
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_mad24
+#define __CLC_BODY "ternary_decl.h"
+
+#include <clc/integer/gentype24.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_MAD24_H__
diff --git a/libclc/clc/include/clc/integer/clc_mad_hi.h b/libclc/clc/include/clc/integer/clc_mad_hi.h
new file mode 100644
index 00000000000000..24a590df6027a8
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_mad_hi.h
@@ -0,0 +1,8 @@
+#ifndef __CLC_INTEGER_CLC_MAD_HI_H__
+#define __CLC_INTEGER_CLC_MAD_HI_H__
+
+#include <clc/integer/clc_mul_hi.h>
+
+#define __clc_mad_hi(a, b, c) (__clc_mul_hi((a), (b)) + (c))
+
+#endif // __CLC_INTEGER_CLC_MAD_HI_H__
diff --git a/libclc/clc/include/clc/integer/clc_mul24.h b/libclc/clc/include/clc/integer/clc_mul24.h
new file mode 100644
index 00000000000000..3355a97affea3c
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_mul24.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_MUL24_H__
+#define __CLC_INTEGER_CLC_MUL24_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible mul24
+#define __clc_mul24 mul24
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_mul24
+#define __CLC_BODY "binary_decl.h"
+
+#include <clc/integer/gentype24.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_MUL24_H__
diff --git a/libclc/clc/include/clc/integer/clc_mul_hi.h b/libclc/clc/include/clc/integer/clc_mul_hi.h
new file mode 100644
index 00000000000000..65b5dce04ab9fb
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_mul_hi.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_MUL_HI_H__
+#define __CLC_INTEGER_CLC_MUL_HI_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible mul_hi
+#define __clc_mul_hi mul_hi
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_mul_hi
+#define __CLC_BODY "binary_decl.h"
+
+#include <clc/integer/gentype.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_MUL_HI_H__
diff --git a/libclc/clc/include/clc/integer/clc_popcount.h b/libclc/clc/include/clc/integer/clc_popcount.h
new file mode 100644
index 00000000000000..7e785a5c1ebe7e
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_popcount.h
@@ -0,0 +1,19 @@
+#ifndef __CLC_INTEGER_CLC_POPCOUNT_H__
+#define __CLC_INTEGER_CLC_POPCOUNT_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible popcount
+#define __clc_popcount popcount
+#else
+
+// Map the function to an LLVM intrinsic
+#define __CLC_FUNCTION __clc_popcount
+#define __CLC_INTRINSIC "llvm.ctpop"
+#include <clc/integer/unary_intrin.inc>
+
+#undef __CLC_INTRINSIC
+#undef __CLC_FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_POPCOUNT_H__
diff --git a/libclc/clc/include/clc/integer/clc_rhadd.h b/libclc/clc/include/clc/integer/clc_rhadd.h
new file mode 100644
index 00000000000000..1fe3920a320ffb
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_rhadd.h
@@ -0,0 +1,21 @@
+#ifndef __CLC_INTEGER_CLC_RHADD_H__
+#define __CLC_INTEGER_CLC_RHADD_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible rhadd
+#define __clc_rhadd rhadd
+#else
+
+#include <clc/clcfunc.h>
+#include <clc/clctypes.h>
+
+#define FUNCTION __clc_rhadd
+#define __CLC_BODY "binary_decl.h"
+
+#include <clc/integer/gentype.inc>
+
+#undef FUNCTION
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_RHADD_H__
diff --git a/libclc/clc/include/clc/integer/clc_upsample.h b/libclc/clc/include/clc/integer/clc_upsample.h
new file mode 100644
index 00000000000000..aebda96434fb41
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_upsample.h
@@ -0,0 +1,38 @@
+#ifndef __CLC_INTEGER_CLC_UPSAMPLE_H__
+#define __CLC_INTEGER_CLC_UPSAMPLE_H__
+
+#if defined(CLC_CLSPV) || defined(CLC_SPIRV)
+// clspv and spir-v targets provide their own OpenCL-compatible upsample
+#define __clc_upsample upsample
+#else
+
+#include <clc/clctypes.h>
+
+#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE)                       \
+  _CLC_OVERLOAD _CLC_DECL BGENTYPE __clc_upsample(GENTYPE hi, UGENTYPE lo);
+
+#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE)                        \
+  __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE)                             \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16)
+
+#define __CLC_UPSAMPLE_TYPES()                                                 \
+  __CLC_UPSAMPLE_VEC(short, char, uchar)                                       \
+  __CLC_UPSAMPLE_VEC(ushort, uchar, uchar)                                     \
+  __CLC_UPSAMPLE_VEC(int, short, ushort)                                       \
+  __CLC_UPSAMPLE_VEC(uint, ushort, ushort)                                     \
+  __CLC_UPSAMPLE_VEC(long, int, uint)                                          \
+  __CLC_UPSAMPLE_VEC(ulong, uint, uint)
+
+__CLC_UPSAMPLE_TYPES()
+
+#undef __CLC_UPSAMPLE_TYPES
+#undef __CLC_UPSAMPLE_DECL
+#undef __CLC_UPSAMPLE_VEC
+
+#endif
+
+#endif // __CLC_INTEGER_CLC_UPSAMPLE_H__
diff --git a/libclc/generic/include/clc/integer/definitions.h b/libclc/clc/include/clc/integer/definitions.h
similarity index 71%
rename from libclc/generic/include/clc/integer/definitions.h
rename to libclc/clc/include/clc/integer/definitions.h
index 0079c30123db80..18a9e54dec75c6 100644
--- a/libclc/generic/include/clc/integer/definitions.h
+++ b/libclc/clc/include/clc/integer/definitions.h
@@ -1,7 +1,10 @@
+#ifndef __CLC_INTEGER_DEFINITIONS_H__
+#define __CLC_INTEGER_DEFINITIONS_H__
+
 #define CHAR_BIT 8
 #define INT_MAX 2147483647
 #define INT_MIN (-2147483647 - 1)
-#define LONG_MAX  0x7fffffffffffffffL
+#define LONG_MAX 0x7fffffffffffffffL
 #define LONG_MIN (-0x7fffffffffffffffL - 1)
 #define CHAR_MAX SCHAR_MAX
 #define CHAR_MIN SCHAR_MIN
@@ -13,3 +16,5 @@
 #define USHRT_MAX 65535
 #define UINT_MAX 0xffffffff
 #define ULONG_MAX 0xffffffffffffffffUL
+
+#endif // __CLC_INTEGER_DEFINITIONS_H__
diff --git a/libclc/clc/include/clc/integer/gentype24.inc b/libclc/clc/include/clc/integer/gentype24.inc
new file mode 100644
index 00000000000000..12859029312405
--- /dev/null
+++ b/libclc/clc/include/clc/integer/gentype24.inc
@@ -0,0 +1,134 @@
+#define __CLC_GENSIZE 32
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE int
+
+#define __CLC_GENTYPE int
+#define __CLC_U_GENTYPE uint
+#define __CLC_S_GENTYPE int
+#define __CLC_SCALAR 1
+#define __CLC_VECSIZE
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int2
+#define __CLC_U_GENTYPE uint2
+#define __CLC_S_GENTYPE int2
+#define __CLC_VECSIZE 2
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int3
+#define __CLC_U_GENTYPE uint3
+#define __CLC_S_GENTYPE int3
+#define __CLC_VECSIZE 3
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int4
+#define __CLC_U_GENTYPE uint4
+#define __CLC_S_GENTYPE int4
+#define __CLC_VECSIZE 4
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int8
+#define __CLC_U_GENTYPE uint8
+#define __CLC_S_GENTYPE int8
+#define __CLC_VECSIZE 8
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE int16
+#define __CLC_U_GENTYPE uint16
+#define __CLC_S_GENTYPE int16
+#define __CLC_VECSIZE 16
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_SCALAR_GENTYPE
+#define __CLC_SCALAR_GENTYPE uint
+
+#define __CLC_GENTYPE uint
+#define __CLC_U_GENTYPE uint
+#define __CLC_S_GENTYPE int
+#define __CLC_SCALAR 1
+#define __CLC_VECSIZE
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_SCALAR
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint2
+#define __CLC_U_GENTYPE uint2
+#define __CLC_S_GENTYPE int2
+#define __CLC_VECSIZE 2
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint3
+#define __CLC_U_GENTYPE uint3
+#define __CLC_S_GENTYPE int3
+#define __CLC_VECSIZE 3
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint4
+#define __CLC_U_GENTYPE uint4
+#define __CLC_S_GENTYPE int4
+#define __CLC_VECSIZE 4
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint8
+#define __CLC_U_GENTYPE uint8
+#define __CLC_S_GENTYPE int8
+#define __CLC_VECSIZE 8
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#define __CLC_GENTYPE uint16
+#define __CLC_U_GENTYPE uint16
+#define __CLC_S_GENTYPE int16
+#define __CLC_VECSIZE 16
+#include __CLC_BODY
+#undef __CLC_VECSIZE
+#undef __CLC_GENTYPE
+#undef __CLC_U_GENTYPE
+#undef __CLC_S_GENTYPE
+
+#undef __CLC_GENSIZE
+#undef __CLC_SCALAR_GENTYPE
+#undef __CLC_BODY
diff --git a/libclc/clc/include/clc/integer/ternary_decl.h b/libclc/clc/include/clc/integer/ternary_decl.h
new file mode 100644
index 00000000000000..495d5c800c62ad
--- /dev/null
+++ b/libclc/clc/include/clc/integer/ternary_decl.h
@@ -0,0 +1,2 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x, __CLC_GENTYPE y,
+                                               __CLC_GENTYPE z);
diff --git a/libclc/clc/include/clc/integer/unary_decl.h b/libclc/clc/include/clc/integer/unary_decl.h
new file mode 100644
index 00000000000000..cf482efb55183e
--- /dev/null
+++ b/libclc/clc/include/clc/integer/unary_decl.h
@@ -0,0 +1 @@
+_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE FUNCTION(__CLC_GENTYPE x);
diff --git a/libclc/clc/include/clc/integer/unary_intrin.inc b/libclc/clc/include/clc/integer/unary_intrin.inc
new file mode 100644
index 00000000000000..169999a8260485
--- /dev/null
+++ b/libclc/clc/include/clc/integer/unary_intrin.inc
@@ -0,0 +1,26 @@
+#define __CLC_INTRINSIC_DEF(SCALAR_TYPE, BIT_SIZE)                             \
+  _CLC_OVERLOAD SCALAR_TYPE __CLC_FUNCTION(SCALAR_TYPE x) __asm(               \
+      __CLC_INTRINSIC ".i" BIT_SIZE);                                          \
+  _CLC_OVERLOAD SCALAR_TYPE##2 __CLC_FUNCTION(SCALAR_TYPE##2 x) __asm(         \
+      __CLC_INTRINSIC ".v2i" BIT_SIZE);                                        \
+  _CLC_OVERLOAD SCALAR_TYPE##3 __CLC_FUNCTION(SCALAR_TYPE##3 x) __asm(         \
+      __CLC_INTRINSIC ".v3i" BIT_SIZE);                                        \
+  _CLC_OVERLOAD SCALAR_TYPE##4 __CLC_FUNCTION(SCALAR_TYPE##4 x) __asm(         \
+      __CLC_INTRINSIC ".v4i" BIT_SIZE);                                        \
+  _CLC_OVERLOAD SCALAR_TYPE##8 __CLC_FUNCTION(SCALAR_TYPE##8 x) __asm(         \
+      __CLC_INTRINSIC ".v8i" BIT_SIZE);                                        \
+  _CLC_OVERLOAD SCALAR_TYPE##16 __CLC_FUNCTION(SCALAR_TYPE##16 x) __asm(       \
+      __CLC_INTRINSIC ".v16i" BIT_SIZE);
+
+__CLC_INTRINSIC_DEF(char, "8")
+__CLC_INTRINSIC_DEF(uchar, "8")
+__CLC_INTRINSIC_DEF(short, "16")
+__CLC_INTRINSIC_DEF(ushort, "16")
+__CLC_INTRINSIC_DEF(int, "32")
+__CLC_INTRINSIC_DEF(uint, "32")
+__CLC_INTRINSIC_DEF(long, "64")
+__CLC_INTRINSIC_DEF(ulong, "64")
+
+#undef __CLC_FUNCTION
+#undef __CLC_INTRINSIC
+#undef __CLC_INTRINSIC_DEF
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index d7ffaaf6dc3f42..72c5821176ce8e 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -1,6 +1,13 @@
 geometric/clc_dot.cl
 integer/clc_abs.cl
 integer/clc_abs_diff.cl
+integer/clc_clz.cl
+integer/clc_hadd.cl
+integer/clc_mad24.cl
+integer/clc_mul24.cl
+integer/clc_mul_hi.cl
+integer/clc_rhadd.cl
+integer/clc_upsample.cl
 relational/clc_all.cl
 relational/clc_any.cl
 relational/clc_bitselect.cl
diff --git a/libclc/clc/lib/generic/integer/clc_clz.cl b/libclc/clc/lib/generic/integer/clc_clz.cl
new file mode 100644
index 00000000000000..592b65f262bd6b
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_clz.cl
@@ -0,0 +1,44 @@
+#include <clc/clcmacro.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/internal/clc.h>
+
+_CLC_OVERLOAD _CLC_DEF char __clc_clz(char x) {
+  return __clc_clz((ushort)(uchar)x) - 8;
+}
+
+_CLC_OVERLOAD _CLC_DEF uchar __clc_clz(uchar x) {
+  return __clc_clz((ushort)x) - 8;
+}
+
+_CLC_OVERLOAD _CLC_DEF short __clc_clz(short x) {
+  return x ? __builtin_clzs(x) : 16;
+}
+
+_CLC_OVERLOAD _CLC_DEF ushort __clc_clz(ushort x) {
+  return x ? __builtin_clzs(x) : 16;
+}
+
+_CLC_OVERLOAD _CLC_DEF int __clc_clz(int x) {
+  return x ? __builtin_clz(x) : 32;
+}
+
+_CLC_OVERLOAD _CLC_DEF uint __clc_clz(uint x) {
+  return x ? __builtin_clz(x) : 32;
+}
+
+_CLC_OVERLOAD _CLC_DEF long __clc_clz(long x) {
+  return x ? __builtin_clzl(x) : 64;
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong __clc_clz(ulong x) {
+  return x ? __builtin_clzl(x) : 64;
+}
+
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, __clc_clz, char)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, __clc_clz, uchar)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, __clc_clz, short)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, __clc_clz, ushort)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, __clc_clz, int)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, __clc_clz, uint)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, __clc_clz, long)
+_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, __clc_clz, ulong)
diff --git a/libclc/clc/lib/generic/integer/clc_hadd.cl b/libclc/clc/lib/generic/integer/clc_hadd.cl
new file mode 100644
index 00000000000000..8e91d41a843aaa
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_hadd.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_hadd.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_hadd.inc b/libclc/clc/lib/generic/integer/clc_hadd.inc
new file mode 100644
index 00000000000000..14d921599446b3
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_hadd.inc
@@ -0,0 +1,8 @@
+// hadd = (x+y)>>1
+// This can be simplified to x>>1 + y>>1 + (1 if both x and y have the 1s bit
+// set). This saves us having to do any checks for overflow in the addition sum
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_hadd(__CLC_GENTYPE x,
+                                                __CLC_GENTYPE y) {
+  return (x >> (__CLC_GENTYPE)1) + (y >> (__CLC_GENTYPE)1) +
+         (x & y & (__CLC_GENTYPE)1);
+}
diff --git a/libclc/clc/lib/generic/integer/clc_mad24.cl b/libclc/clc/lib/generic/integer/clc_mad24.cl
new file mode 100644
index 00000000000000..86c319cff6d245
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_mad24.cl
@@ -0,0 +1,5 @@
+#include <clc/internal/clc.h>
+#include <clc/integer/clc_mul24.h>
+
+#define __CLC_BODY <clc_mad24.inc>
+#include <clc/integer/gentype24.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_mad24.inc b/libclc/clc/lib/generic/integer/clc_mad24.inc
new file mode 100644
index 00000000000000..61c8587d4f86fc
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_mad24.inc
@@ -0,0 +1,5 @@
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mad24(__CLC_GENTYPE x,
+                                                 __CLC_GENTYPE y,
+                                                 __CLC_GENTYPE z) {
+  return __clc_mul24(x, y) + z;
+}
diff --git a/libclc/clc/lib/generic/integer/clc_mul24.cl b/libclc/clc/lib/generic/integer/clc_mul24.cl
new file mode 100644
index 00000000000000..6513a896a8b1d2
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_mul24.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_mul24.inc>
+#include <clc/integer/gentype24.inc>
diff --git a/libclc/generic/lib/integer/mul24.inc b/libclc/clc/lib/generic/integer/clc_mul24.inc
similarity index 68%
rename from libclc/generic/lib/integer/mul24.inc
rename to libclc/clc/lib/generic/integer/clc_mul24.inc
index 95a2f1d6f31bab..d7e8091c98a314 100644
--- a/libclc/generic/lib/integer/mul24.inc
+++ b/libclc/clc/lib/generic/integer/clc_mul24.inc
@@ -1,10 +1,10 @@
-
 // We need to use shifts here in order to mantain the sign bit for signed
 // integers.  The compiler should optimize this to (x & 0x00FFFFFF) for
 // unsigned integers.
 #define CONVERT_TO_24BIT(x) (((x) << 8) >> 8)
 
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y){
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_mul24(__CLC_GENTYPE x,
+                                                 __CLC_GENTYPE y) {
   return CONVERT_TO_24BIT(x) * CONVERT_TO_24BIT(y);
 }
 
diff --git a/libclc/clc/lib/generic/integer/clc_mul_hi.cl b/libclc/clc/lib/generic/integer/clc_mul_hi.cl
new file mode 100644
index 00000000000000..07486abac52b73
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_mul_hi.cl
@@ -0,0 +1,113 @@
+#include <clc/integer/clc_hadd.h>
+#include <clc/integer/definitions.h>
+#include <clc/internal/clc.h>
+
+// For all types EXCEPT long, which is implemented separately
+#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE)                          \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE __clc_mul_hi(GENTYPE x, GENTYPE y) {          \
+    return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE);                  \
+  }
+
+// FOIL-based long mul_hi
+//
+//  Summary: Treat mul_hi(long x, long y) as:
+//  (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
+//  and b and d are the low-order parts of x and y.
+//  Thinking back to algebra, we use FOIL to do the work.
+
+_CLC_OVERLOAD _CLC_DEF long __clc_mul_hi(long x, long y) {
+  long f, o, i;
+  ulong l;
+
+  // Move the high/low halves of x/y into the lower 32-bits of variables so
+  // that we can multiply them without worrying about overflow.
+  long x_hi = x >> 32;
+  long x_lo = x & UINT_MAX;
+  long y_hi = y >> 32;
+  long y_lo = y & UINT_MAX;
+
+  // Multiply all of the components according to FOIL method
+  f = x_hi * y_hi;
+  o = x_hi * y_lo;
+  i = x_lo * y_hi;
+  l = x_lo * y_lo;
+
+  // Now add the components back together in the following steps:
+  // F: doesn't need to be modified
+  // O/I: Need to be added together.
+  // L: Shift right by 32-bits, then add into the sum of O and I
+  // Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
+  //
+  // We use hadd to give us a bit of extra precision for the intermediate sums
+  // but as a result, we shift by 31 bits instead of 32
+  return (long)(f + (__clc_hadd(o, (i + (long)((ulong)l >> 32))) >> 31));
+}
+
+_CLC_OVERLOAD _CLC_DEF ulong __clc_mul_hi(ulong x, ulong y) {
+  ulong f, o, i;
+  ulong l;
+
+  // Move the high/low halves of x/y into the lower 32-bits of variables so
+  // that we can multiply them without worrying about overflow.
+  ulong x_hi = x >> 32;
+  ulong x_lo = x & UINT_MAX;
+  ulong y_hi = y >> 32;
+  ulong y_lo = y & UINT_MAX;
+
+  // Multiply all of the components according to FOIL method
+  f = x_hi * y_hi;
+  o = x_hi * y_lo;
+  i = x_lo * y_hi;
+  l = x_lo * y_lo;
+
+  // Now add the components back together, taking care to respect the fact that:
+  // F: doesn't need to be modified
+  // O/I: Need to be added together.
+  // L: Shift right by 32-bits, then add into the sum of O and I
+  // Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
+  //
+  // We use hadd to give us a bit of extra precision for the intermediate sums
+  // but as a result, we shift by 31 bits instead of 32
+  return (f + (__clc_hadd(o, (i + (l >> 32))) >> 31));
+}
+
+#define __CLC_MUL_HI_VEC(GENTYPE)                                              \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE##2 __clc_mul_hi(GENTYPE##2 x, GENTYPE##2 y) { \
+    return (GENTYPE##2){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1)};   \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE##3 __clc_mul_hi(GENTYPE##3 x, GENTYPE##3 y) { \
+    return (GENTYPE##3){__clc_mul_hi(x.s0, y.s0), __clc_mul_hi(x.s1, y.s1),    \
+                        __clc_mul_hi(x.s2, y.s2)};                             \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE##4 __clc_mul_hi(GENTYPE##4 x, GENTYPE##4 y) { \
+    return (GENTYPE##4){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)};   \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE##8 __clc_mul_hi(GENTYPE##8 x, GENTYPE##8 y) { \
+    return (GENTYPE##8){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)};   \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF GENTYPE##16 __clc_mul_hi(GENTYPE##16 x,               \
+                                                  GENTYPE##16 y) {             \
+    return (GENTYPE##16){__clc_mul_hi(x.lo, y.lo), __clc_mul_hi(x.hi, y.hi)};  \
+  }
+
+#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS)                               \
+  __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS)                                         \
+  __CLC_MUL_HI_VEC(TYPE)
+
+#define __CLC_MUL_HI_TYPES()                                                   \
+  __CLC_MUL_HI_DEC_IMPL(short, char, 8)                                        \
+  __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8)                                      \
+  __CLC_MUL_HI_DEC_IMPL(int, short, 16)                                        \
+  __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16)                                      \
+  __CLC_MUL_HI_DEC_IMPL(long, int, 32)                                         \
+  __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32)                                       \
+  __CLC_MUL_HI_VEC(long)                                                       \
+  __CLC_MUL_HI_VEC(ulong)
+
+__CLC_MUL_HI_TYPES()
+
+#undef __CLC_MUL_HI_TYPES
+#undef __CLC_MUL_HI_DEC_IMPL
+#undef __CLC_MUL_HI_IMPL
+#undef __CLC_MUL_HI_VEC
+#undef __CLC_B32
diff --git a/libclc/clc/lib/generic/integer/clc_rhadd.cl b/libclc/clc/lib/generic/integer/clc_rhadd.cl
new file mode 100644
index 00000000000000..00bd2f0ac8058a
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_rhadd.cl
@@ -0,0 +1,4 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_BODY <clc_rhadd.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_rhadd.inc b/libclc/clc/lib/generic/integer/clc_rhadd.inc
new file mode 100644
index 00000000000000..d363c42061ffe1
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_rhadd.inc
@@ -0,0 +1,8 @@
+// rhadd = (x+y+1)>>1
+// This can be simplified to x>>1 + y>>1 + (1 if either x or y have the 1s bit
+// set). This saves us having to do any checks for overflow in the addition sums
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rhadd(__CLC_GENTYPE x,
+                                                 __CLC_GENTYPE y) {
+  return (x >> (__CLC_GENTYPE)1) + (y >> (__CLC_GENTYPE)1) +
+         ((x & (__CLC_GENTYPE)1) | (y & (__CLC_GENTYPE)1));
+}
diff --git a/libclc/clc/lib/generic/integer/clc_upsample.cl b/libclc/clc/lib/generic/integer/clc_upsample.cl
new file mode 100644
index 00000000000000..303bb4aa39330a
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_upsample.cl
@@ -0,0 +1,45 @@
+#include <clc/internal/clc.h>
+
+#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE)              \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE __clc_upsample(GENTYPE hi, UGENTYPE lo) {    \
+    return ((BGENTYPE)hi << GENSIZE) | lo;                                     \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE##2 __clc_upsample(GENTYPE##2 hi,             \
+                                                    UGENTYPE##2 lo) {          \
+    return (BGENTYPE##2){__clc_upsample(hi.s0, lo.s0),                         \
+                         __clc_upsample(hi.s1, lo.s1)};                        \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE##3 __clc_upsample(GENTYPE##3 hi,             \
+                                                    UGENTYPE##3 lo) {          \
+    return (BGENTYPE##3){__clc_upsample(hi.s0, lo.s0),                         \
+                         __clc_upsample(hi.s1, lo.s1),                         \
+                         __clc_upsample(hi.s2, lo.s2)};                        \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE##4 __clc_upsample(GENTYPE##4 hi,             \
+                                                    UGENTYPE##4 lo) {          \
+    return (BGENTYPE##4){__clc_upsample(hi.lo, lo.lo),                         \
+                         __clc_upsample(hi.hi, lo.hi)};                        \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE##8 __clc_upsample(GENTYPE##8 hi,             \
+                                                    UGENTYPE##8 lo) {          \
+    return (BGENTYPE##8){__clc_upsample(hi.lo, lo.lo),                         \
+                         __clc_upsample(hi.hi, lo.hi)};                        \
+  }                                                                            \
+  _CLC_OVERLOAD _CLC_DEF BGENTYPE##16 __clc_upsample(GENTYPE##16 hi,           \
+                                                     UGENTYPE##16 lo) {        \
+    return (BGENTYPE##16){__clc_upsample(hi.lo, lo.lo),                        \
+                          __clc_upsample(hi.hi, lo.hi)};                       \
+  }
+
+#define __CLC_UPSAMPLE_TYPES()                                                 \
+  __CLC_UPSAMPLE_IMPL(short, char, uchar, 8)                                   \
+  __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8)                                 \
+  __CLC_UPSAMPLE_IMPL(int, short, ushort, 16)                                  \
+  __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16)                                \
+  __CLC_UPSAMPLE_IMPL(long, int, uint, 32)                                     \
+  __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32)
+
+__CLC_UPSAMPLE_TYPES()
+
+#undef __CLC_UPSAMPLE_TYPES
+#undef __CLC_UPSAMPLE_IMPL
diff --git a/libclc/generic/include/clc/integer/clz.h b/libclc/generic/include/clc/integer/clz.h
index f7cdbf78ec0607..c50e7878810ec9 100644
--- a/libclc/generic/include/clc/integer/clz.h
+++ b/libclc/generic/include/clc/integer/clz.h
@@ -1,2 +1,6 @@
-#define __CLC_BODY <clc/integer/clz.inc>
+#define FUNCTION clz
+#define __CLC_BODY "unary_decl.h"
+
 #include <clc/integer/gentype.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/clz.inc b/libclc/generic/include/clc/integer/clz.inc
deleted file mode 100644
index 45826d10c9fafe..00000000000000
--- a/libclc/generic/include/clc/integer/clz.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE clz(__CLC_GENTYPE x);
diff --git a/libclc/generic/include/clc/integer/hadd.h b/libclc/generic/include/clc/integer/hadd.h
index 37304e26cc2d62..24ce4604c88e42 100644
--- a/libclc/generic/include/clc/integer/hadd.h
+++ b/libclc/generic/include/clc/integer/hadd.h
@@ -1,2 +1,6 @@
-#define __CLC_BODY <clc/integer/hadd.inc>
+#define FUNCTION hadd
+#define __CLC_BODY "binary_decl.h"
+
 #include <clc/integer/gentype.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/hadd.inc b/libclc/generic/include/clc/integer/hadd.inc
deleted file mode 100644
index f698989cef2026..00000000000000
--- a/libclc/generic/include/clc/integer/hadd.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE hadd(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/libclc/generic/include/clc/integer/mad24.h b/libclc/generic/include/clc/integer/mad24.h
index 0c120faac2b15b..7166f3df509cc0 100644
--- a/libclc/generic/include/clc/integer/mad24.h
+++ b/libclc/generic/include/clc/integer/mad24.h
@@ -1,3 +1,6 @@
-#define __CLC_BODY <clc/integer/mad24.inc>
-#include <clc/integer/integer-gentype.inc>
-#undef __CLC_BODY
+#define FUNCTION mad24
+#define __CLC_BODY "ternary_decl.h"
+
+#include <clc/integer/gentype24.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/mad24.inc b/libclc/generic/include/clc/integer/mad24.inc
deleted file mode 100644
index 81fe0c2a89266c..00000000000000
--- a/libclc/generic/include/clc/integer/mad24.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z);
diff --git a/libclc/generic/include/clc/integer/mad_hi.h b/libclc/generic/include/clc/integer/mad_hi.h
index 863ce92d9f2d8c..9ff70851a738ad 100644
--- a/libclc/generic/include/clc/integer/mad_hi.h
+++ b/libclc/generic/include/clc/integer/mad_hi.h
@@ -1 +1,6 @@
-#define mad_hi(a, b, c) (mul_hi((a),(b))+(c))
+#define FUNCTION mad_hi
+#define __CLC_BODY "ternary_decl.h"
+
+#include <clc/integer/gentype.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/mul24.h b/libclc/generic/include/clc/integer/mul24.h
index 4f97098d70f0f4..5a84b039260851 100644
--- a/libclc/generic/include/clc/integer/mul24.h
+++ b/libclc/generic/include/clc/integer/mul24.h
@@ -1,3 +1,6 @@
-#define __CLC_BODY <clc/integer/mul24.inc>
-#include <clc/integer/integer-gentype.inc>
-#undef __CLC_BODY
+#define FUNCTION mul24
+#define __CLC_BODY "binary_decl.h"
+
+#include <clc/integer/gentype24.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/mul24.inc b/libclc/generic/include/clc/integer/mul24.inc
deleted file mode 100644
index 8cbf7c10ac447d..00000000000000
--- a/libclc/generic/include/clc/integer/mul24.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul24(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/libclc/generic/include/clc/integer/mul_hi.h b/libclc/generic/include/clc/integer/mul_hi.h
index 27b95d83442f97..89afdead91f173 100644
--- a/libclc/generic/include/clc/integer/mul_hi.h
+++ b/libclc/generic/include/clc/integer/mul_hi.h
@@ -1,2 +1,6 @@
-#define __CLC_BODY <clc/integer/mul_hi.inc>
+#define FUNCTION mul_hi
+#define __CLC_BODY "binary_decl.h"
+
 #include <clc/integer/gentype.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/mul_hi.inc b/libclc/generic/include/clc/integer/mul_hi.inc
deleted file mode 100644
index ce9e5c0b2c18c8..00000000000000
--- a/libclc/generic/include/clc/integer/mul_hi.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE mul_hi(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/libclc/generic/include/clc/integer/popcount.h b/libclc/generic/include/clc/integer/popcount.h
index 23335f45b6fe3f..1706822794ffb7 100644
--- a/libclc/generic/include/clc/integer/popcount.h
+++ b/libclc/generic/include/clc/integer/popcount.h
@@ -1,5 +1,6 @@
-#define __CLC_FUNCTION popcount
-#define __CLC_BODY <clc/integer/unary.inc>
+#define FUNCTION popcount
+#define __CLC_BODY "unary_decl.h"
+
 #include <clc/integer/gentype.inc>
-#undef __CLC_FUNCTION
-#undef __CLC_BODY
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/rhadd.h b/libclc/generic/include/clc/integer/rhadd.h
index 69b43faeebd246..8ea537a932ef1a 100644
--- a/libclc/generic/include/clc/integer/rhadd.h
+++ b/libclc/generic/include/clc/integer/rhadd.h
@@ -1,2 +1,6 @@
-#define __CLC_BODY <clc/integer/rhadd.inc>
+#define FUNCTION rhadd
+#define __CLC_BODY "binary_decl.h"
+
 #include <clc/integer/gentype.inc>
+
+#undef FUNCTION
diff --git a/libclc/generic/include/clc/integer/rhadd.inc b/libclc/generic/include/clc/integer/rhadd.inc
deleted file mode 100644
index 88ccaf09fd5ef8..00000000000000
--- a/libclc/generic/include/clc/integer/rhadd.inc
+++ /dev/null
@@ -1 +0,0 @@
-_CLC_OVERLOAD _CLC_DECL __CLC_GENTYPE rhadd(__CLC_GENTYPE x, __CLC_GENTYPE y);
diff --git a/libclc/generic/include/clc/integer/upsample.h b/libclc/generic/include/clc/integer/upsample.h
index 0b36b692a2c8d3..37038f6ad90344 100644
--- a/libclc/generic/include/clc/integer/upsample.h
+++ b/libclc/generic/include/clc/integer/upsample.h
@@ -1,25 +1,24 @@
-#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE) \
-    _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo);
+#define __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE)                       \
+  _CLC_OVERLOAD _CLC_DECL BGENTYPE upsample(GENTYPE hi, UGENTYPE lo);
 
-#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8) \
-    __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16) \
+#define __CLC_UPSAMPLE_VEC(BGENTYPE, GENTYPE, UGENTYPE)                        \
+  __CLC_UPSAMPLE_DECL(BGENTYPE, GENTYPE, UGENTYPE)                             \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##2, GENTYPE##2, UGENTYPE##2)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##3, GENTYPE##3, UGENTYPE##3)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##4, GENTYPE##4, UGENTYPE##4)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##8, GENTYPE##8, UGENTYPE##8)                    \
+  __CLC_UPSAMPLE_DECL(BGENTYPE##16, GENTYPE##16, UGENTYPE##16)
 
-#define __CLC_UPSAMPLE_TYPES() \
-    __CLC_UPSAMPLE_VEC(short, char, uchar) \
-    __CLC_UPSAMPLE_VEC(ushort, uchar, uchar) \
-    __CLC_UPSAMPLE_VEC(int, short, ushort) \
-    __CLC_UPSAMPLE_VEC(uint, ushort, ushort) \
-    __CLC_UPSAMPLE_VEC(long, int, uint) \
-    __CLC_UPSAMPLE_VEC(ulong, uint, uint) \
+#define __CLC_UPSAMPLE_TYPES()                                                 \
+  __CLC_UPSAMPLE_VEC(short, char, uchar)                                       \
+  __CLC_UPSAMPLE_VEC(ushort, uchar, uchar)                                     \
+  __CLC_UPSAMPLE_VEC(int, short, ushort)                                       \
+  __CLC_UPSAMPLE_VEC(uint, ushort, ushort)                                     \
+  __CLC_UPSAMPLE_VEC(long, int, uint)                                          \
+  __CLC_UPSAMPLE_VEC(ulong, uint, uint)
 
 __CLC_UPSAMPLE_TYPES()
 
 #undef __CLC_UPSAMPLE_TYPES
 #undef __CLC_UPSAMPLE_DECL
 #undef __CLC_UPSAMPLE_VEC
-
diff --git a/libclc/generic/include/integer/popcount.h b/libclc/generic/include/integer/popcount.h
deleted file mode 100644
index 00c753753bb4e0..00000000000000
--- a/libclc/generic/include/integer/popcount.h
+++ /dev/null
@@ -1,3 +0,0 @@
-#define __CLC_FUNCTION __clc_native_popcount
-#define __CLC_INTRINSIC "llvm.ctpop"
-#include <integer/unary_intrin.inc>
diff --git a/libclc/generic/include/integer/unary_intrin.inc b/libclc/generic/include/integer/unary_intrin.inc
deleted file mode 100644
index ee9862a4c5b3a6..00000000000000
--- a/libclc/generic/include/integer/unary_intrin.inc
+++ /dev/null
@@ -1,20 +0,0 @@
-#define __CLC_INTRINSIC_DEF(SCALAR_TYPE, BIT_SIZE) \
-_CLC_OVERLOAD SCALAR_TYPE __CLC_FUNCTION(SCALAR_TYPE x) __asm(__CLC_INTRINSIC ".i" BIT_SIZE); \
-_CLC_OVERLOAD SCALAR_TYPE##2 __CLC_FUNCTION(SCALAR_TYPE##2 x) __asm(__CLC_INTRINSIC ".v2i" BIT_SIZE); \
-_CLC_OVERLOAD SCALAR_TYPE##3 __CLC_FUNCTION(SCALAR_TYPE##3 x) __asm(__CLC_INTRINSIC ".v3i" BIT_SIZE); \
-_CLC_OVERLOAD SCALAR_TYPE##4 __CLC_FUNCTION(SCALAR_TYPE##4 x) __asm(__CLC_INTRINSIC ".v4i" BIT_SIZE); \
-_CLC_OVERLOAD SCALAR_TYPE##8 __CLC_FUNCTION(SCALAR_TYPE##8 x) __asm(__CLC_INTRINSIC ".v8i" BIT_SIZE); \
-_CLC_OVERLOAD SCALAR_TYPE##16 __CLC_FUNCTION(SCALAR_TYPE##16 x) __asm(__CLC_INTRINSIC ".v16i" BIT_SIZE);
-
-__CLC_INTRINSIC_DEF(char, "8")
-__CLC_INTRINSIC_DEF(uchar, "8")
-__CLC_INTRINSIC_DEF(short, "16")
-__CLC_INTRINSIC_DEF(ushort, "16")
-__CLC_INTRINSIC_DEF(int, "32")
-__CLC_INTRINSIC_DEF(uint, "32")
-__CLC_INTRINSIC_DEF(long, "64")
-__CLC_INTRINSIC_DEF(ulong, "64")
-
-#undef __CLC_FUNCTION
-#undef __CLC_INTRINSIC
-#undef __CLC_INTRINSIC_DEF
diff --git a/libclc/generic/lib/SOURCES b/libclc/generic/lib/SOURCES
index 579e909e53d462..b862e6aa54b996 100644
--- a/libclc/generic/lib/SOURCES
+++ b/libclc/generic/lib/SOURCES
@@ -68,6 +68,7 @@ integer/add_sat.cl
 integer/clz.cl
 integer/hadd.cl
 integer/mad24.cl
+integer/mad_hi.cl
 integer/mad_sat.cl
 integer/mul24.cl
 integer/mul_hi.cl
diff --git a/libclc/generic/lib/integer/binary_def.inc b/libclc/generic/lib/integer/binary_def.inc
new file mode 100644
index 00000000000000..0f14a8c5c35f41
--- /dev/null
+++ b/libclc/generic/lib/integer/binary_def.inc
@@ -0,0 +1,8 @@
+#include <clc/utils.h>
+
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a,
+                                              __CLC_GENTYPE b) {
+  return __CLC_FUNCTION(FUNCTION)(a, b);
+}
diff --git a/libclc/generic/lib/integer/clz.cl b/libclc/generic/lib/integer/clz.cl
index 904d027d376134..bbbe8b4c642d80 100644
--- a/libclc/generic/lib/integer/clz.cl
+++ b/libclc/generic/lib/integer/clz.cl
@@ -1,43 +1,7 @@
 #include <clc/clc.h>
-#include <clc/clcmacro.h>
+#include <clc/integer/clc_clz.h>
 
-_CLC_OVERLOAD _CLC_DEF char clz(char x) {
-  return clz((ushort)(uchar)x) - 8;
-}
+#define FUNCTION clz
+#define __CLC_BODY "unary_def.inc"
 
-_CLC_OVERLOAD _CLC_DEF uchar clz(uchar x) {
-  return clz((ushort)x) - 8;
-}
-
-_CLC_OVERLOAD _CLC_DEF short clz(short x) {
-  return x ? __builtin_clzs(x) : 16;
-}
-
-_CLC_OVERLOAD _CLC_DEF ushort clz(ushort x) {
-  return x ? __builtin_clzs(x) : 16;
-}
-
-_CLC_OVERLOAD _CLC_DEF int clz(int x) {
-  return x ? __builtin_clz(x) : 32;
-}
-
-_CLC_OVERLOAD _CLC_DEF uint clz(uint x) {
-  return x ? __builtin_clz(x) : 32;
-}
-
-_CLC_OVERLOAD _CLC_DEF long clz(long x) {
-  return x ? __builtin_clzl(x) : 64;
-}
-
-_CLC_OVERLOAD _CLC_DEF ulong clz(ulong x) {
-  return x ? __builtin_clzl(x) : 64;
-}
-
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, char, clz, char)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uchar, clz, uchar)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, short, clz, short)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ushort, clz, ushort)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, int, clz, int)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, uint, clz, uint)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, long, clz, long)
-_CLC_UNARY_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, ulong, clz, ulong)
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/hadd.cl b/libclc/generic/lib/integer/hadd.cl
index 749026e5a8ad81..9fd53422b76cc3 100644
--- a/libclc/generic/lib/integer/hadd.cl
+++ b/libclc/generic/lib/integer/hadd.cl
@@ -1,4 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_hadd.h>
+
+#define FUNCTION hadd
+#define __CLC_BODY "binary_def.inc"
 
-#define __CLC_BODY <hadd.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/hadd.inc b/libclc/generic/lib/integer/hadd.inc
deleted file mode 100644
index ea59d9bd7db5f8..00000000000000
--- a/libclc/generic/lib/integer/hadd.inc
+++ /dev/null
@@ -1,6 +0,0 @@
-//hadd = (x+y)>>1
-//This can be simplified to x>>1 + y>>1 + (1 if both x and y have the 1s bit set)
-//This saves us having to do any checks for overflow in the addition sum
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE hadd(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-    return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+(x&y&(__CLC_GENTYPE)1);
-}
diff --git a/libclc/generic/lib/integer/mad24.cl b/libclc/generic/lib/integer/mad24.cl
index e29e99f28b56fc..db49c4aa7f10b0 100644
--- a/libclc/generic/lib/integer/mad24.cl
+++ b/libclc/generic/lib/integer/mad24.cl
@@ -1,4 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_mad24.h>
 
-#define __CLC_BODY <mad24.inc>
-#include <clc/integer/integer-gentype.inc>
+#define FUNCTION mad24
+#define __CLC_BODY "ternary_def.inc"
+
+#include <clc/integer/gentype24.inc>
diff --git a/libclc/generic/lib/integer/mad24.inc b/libclc/generic/lib/integer/mad24.inc
deleted file mode 100644
index 902b0aafe4c874..00000000000000
--- a/libclc/generic/lib/integer/mad24.inc
+++ /dev/null
@@ -1,3 +0,0 @@
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE mad24(__CLC_GENTYPE x, __CLC_GENTYPE y, __CLC_GENTYPE z){
-  return mul24(x, y) + z;
-}
diff --git a/libclc/generic/lib/integer/mad_hi.cl b/libclc/generic/lib/integer/mad_hi.cl
new file mode 100644
index 00000000000000..21a084cc2fb31d
--- /dev/null
+++ b/libclc/generic/lib/integer/mad_hi.cl
@@ -0,0 +1,7 @@
+#include <clc/clc.h>
+#include <clc/integer/clc_mad_hi.h>
+
+#define FUNCTION mad_hi
+#define __CLC_BODY "ternary_def.inc"
+
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/mul24.cl b/libclc/generic/lib/integer/mul24.cl
index 8aedca64b85905..b4a5eca148f2b2 100644
--- a/libclc/generic/lib/integer/mul24.cl
+++ b/libclc/generic/lib/integer/mul24.cl
@@ -1,4 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_mul24.h>
 
-#define __CLC_BODY <mul24.inc>
-#include <clc/integer/integer-gentype.inc>
+#define FUNCTION mul24
+#define __CLC_BODY "binary_def.inc"
+
+#include <clc/integer/gentype24.inc>
diff --git a/libclc/generic/lib/integer/mul_hi.cl b/libclc/generic/lib/integer/mul_hi.cl
index 174d893afb14f9..249e7a713f67f5 100644
--- a/libclc/generic/lib/integer/mul_hi.cl
+++ b/libclc/generic/lib/integer/mul_hi.cl
@@ -1,109 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_mul_hi.h>
 
-//For all types EXCEPT long, which is implemented separately
-#define __CLC_MUL_HI_IMPL(BGENTYPE, GENTYPE, GENSIZE) \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE mul_hi(GENTYPE x, GENTYPE y){ \
-        return (GENTYPE)(((BGENTYPE)x * (BGENTYPE)y) >> GENSIZE); \
-    } \
+#define FUNCTION mul_hi
+#define __CLC_BODY "binary_def.inc"
 
-//FOIL-based long mul_hi
-//
-// Summary: Treat mul_hi(long x, long y) as:
-// (a+b) * (c+d) where a and c are the high-order parts of x and y respectively
-// and b and d are the low-order parts of x and y.
-// Thinking back to algebra, we use FOIL to do the work.
-
-_CLC_OVERLOAD _CLC_DEF long mul_hi(long x, long y){
-    long f, o, i;
-    ulong l;
-
-    //Move the high/low halves of x/y into the lower 32-bits of variables so
-    //that we can multiply them without worrying about overflow.
-    long x_hi = x >> 32;
-    long x_lo = x & UINT_MAX;
-    long y_hi = y >> 32;
-    long y_lo = y & UINT_MAX;
-
-    //Multiply all of the components according to FOIL method
-    f = x_hi * y_hi;
-    o = x_hi * y_lo;
-    i = x_lo * y_hi;
-    l = x_lo * y_lo;
-
-    //Now add the components back together in the following steps:
-    //F: doesn't need to be modified
-    //O/I: Need to be added together.
-    //L: Shift right by 32-bits, then add into the sum of O and I
-    //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
-    //
-    //We use hadd to give us a bit of extra precision for the intermediate sums
-    //but as a result, we shift by 31 bits instead of 32
-    return (long)(f + (hadd(o, (i + (long)((ulong)l>>32))) >> 31));
-}
-
-_CLC_OVERLOAD _CLC_DEF ulong mul_hi(ulong x, ulong y){
-    ulong f, o, i;
-    ulong l;
-
-    //Move the high/low halves of x/y into the lower 32-bits of variables so
-    //that we can multiply them without worrying about overflow.
-    ulong x_hi = x >> 32;
-    ulong x_lo = x & UINT_MAX;
-    ulong y_hi = y >> 32;
-    ulong y_lo = y & UINT_MAX;
-
-    //Multiply all of the components according to FOIL method
-    f = x_hi * y_hi;
-    o = x_hi * y_lo;
-    i = x_lo * y_hi;
-    l = x_lo * y_lo;
-
-    //Now add the components back together, taking care to respect the fact that:
-    //F: doesn't need to be modified
-    //O/I: Need to be added together.
-    //L: Shift right by 32-bits, then add into the sum of O and I
-    //Once O/I/L are summed up, then shift the sum by 32-bits and add to F.
-    //
-    //We use hadd to give us a bit of extra precision for the intermediate sums
-    //but as a result, we shift by 31 bits instead of 32
-    return (f + (hadd(o, (i + (l>>32))) >> 31));
-}
-
-#define __CLC_MUL_HI_VEC(GENTYPE) \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE##2 mul_hi(GENTYPE##2 x, GENTYPE##2 y){ \
-        return (GENTYPE##2){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE##3 mul_hi(GENTYPE##3 x, GENTYPE##3 y){ \
-        return (GENTYPE##3){mul_hi(x.s0, y.s0), mul_hi(x.s1, y.s1), mul_hi(x.s2, y.s2)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE##4 mul_hi(GENTYPE##4 x, GENTYPE##4 y){ \
-        return (GENTYPE##4){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE##8 mul_hi(GENTYPE##8 x, GENTYPE##8 y){ \
-        return (GENTYPE##8){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF GENTYPE##16 mul_hi(GENTYPE##16 x, GENTYPE##16 y){ \
-        return (GENTYPE##16){mul_hi(x.lo, y.lo), mul_hi(x.hi, y.hi)}; \
-    } \
-
-#define __CLC_MUL_HI_DEC_IMPL(BTYPE, TYPE, BITS) \
-    __CLC_MUL_HI_IMPL(BTYPE, TYPE, BITS) \
-    __CLC_MUL_HI_VEC(TYPE)
-
-#define __CLC_MUL_HI_TYPES() \
-    __CLC_MUL_HI_DEC_IMPL(short, char, 8) \
-    __CLC_MUL_HI_DEC_IMPL(ushort, uchar, 8) \
-    __CLC_MUL_HI_DEC_IMPL(int, short, 16) \
-    __CLC_MUL_HI_DEC_IMPL(uint, ushort, 16) \
-    __CLC_MUL_HI_DEC_IMPL(long, int, 32) \
-    __CLC_MUL_HI_DEC_IMPL(ulong, uint, 32) \
-    __CLC_MUL_HI_VEC(long) \
-    __CLC_MUL_HI_VEC(ulong)
-
-__CLC_MUL_HI_TYPES()
-
-#undef __CLC_MUL_HI_TYPES
-#undef __CLC_MUL_HI_DEC_IMPL
-#undef __CLC_MUL_HI_IMPL
-#undef __CLC_MUL_HI_VEC
-#undef __CLC_B32
+#include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/popcount.cl b/libclc/generic/lib/integer/popcount.cl
index ca83b1afaf9dab..f646e838351767 100644
--- a/libclc/generic/lib/integer/popcount.cl
+++ b/libclc/generic/lib/integer/popcount.cl
@@ -1,8 +1,7 @@
 #include <clc/clc.h>
-#include <integer/popcount.h>
+#include <clc/integer/clc_popcount.h>
 
-#define __CLC_FUNC popcount
-#define __CLC_IMPL_FUNC __clc_native_popcount
+#define FUNCTION popcount
+#define __CLC_BODY "unary_def.inc"
 
-#define __CLC_BODY "../clc_unary.inc"
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/rhadd.cl b/libclc/generic/lib/integer/rhadd.cl
index c985870f7c7a24..a919bd33f4a6cf 100644
--- a/libclc/generic/lib/integer/rhadd.cl
+++ b/libclc/generic/lib/integer/rhadd.cl
@@ -1,4 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_rhadd.h>
+
+#define FUNCTION rhadd
+#define __CLC_BODY "binary_def.inc"
 
-#define __CLC_BODY <rhadd.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/rhadd.inc b/libclc/generic/lib/integer/rhadd.inc
deleted file mode 100644
index 3d6076874808e6..00000000000000
--- a/libclc/generic/lib/integer/rhadd.inc
+++ /dev/null
@@ -1,6 +0,0 @@
-//rhadd = (x+y+1)>>1
-//This can be simplified to x>>1 + y>>1 + (1 if either x or y have the 1s bit set)
-//This saves us having to do any checks for overflow in the addition sums
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rhadd(__CLC_GENTYPE x, __CLC_GENTYPE y) {
-    return (x>>(__CLC_GENTYPE)1)+(y>>(__CLC_GENTYPE)1)+((x&(__CLC_GENTYPE)1)|(y&(__CLC_GENTYPE)1));
-}
diff --git a/libclc/generic/lib/integer/ternary_def.inc b/libclc/generic/lib/integer/ternary_def.inc
new file mode 100644
index 00000000000000..2c43b486685b8f
--- /dev/null
+++ b/libclc/generic/lib/integer/ternary_def.inc
@@ -0,0 +1,8 @@
+#include <clc/utils.h>
+
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a, __CLC_GENTYPE b,
+                                              __CLC_GENTYPE c) {
+  return __CLC_FUNCTION(FUNCTION)(a, b, c);
+}
diff --git a/libclc/generic/lib/integer/unary_def.inc b/libclc/generic/lib/integer/unary_def.inc
new file mode 100644
index 00000000000000..762f85eedead1b
--- /dev/null
+++ b/libclc/generic/lib/integer/unary_def.inc
@@ -0,0 +1,7 @@
+#include <clc/utils.h>
+
+#define __CLC_FUNCTION(x) __CLC_CONCAT(__clc_, x)
+
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE FUNCTION(__CLC_GENTYPE a) {
+  return __CLC_FUNCTION(FUNCTION)(a);
+}
diff --git a/libclc/generic/lib/integer/upsample.cl b/libclc/generic/lib/integer/upsample.cl
index da77315f8f9344..984a731e3b4d12 100644
--- a/libclc/generic/lib/integer/upsample.cl
+++ b/libclc/generic/lib/integer/upsample.cl
@@ -1,32 +1,34 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_upsample.h>
 
-#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE, GENSIZE) \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE upsample(GENTYPE hi, UGENTYPE lo){ \
-        return ((BGENTYPE)hi << GENSIZE) | lo; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE##2 upsample(GENTYPE##2 hi, UGENTYPE##2 lo){ \
-        return (BGENTYPE##2){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE##3 upsample(GENTYPE##3 hi, UGENTYPE##3 lo){ \
-        return (BGENTYPE##3){upsample(hi.s0, lo.s0), upsample(hi.s1, lo.s1), upsample(hi.s2, lo.s2)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE##4 upsample(GENTYPE##4 hi, UGENTYPE##4 lo){ \
-        return (BGENTYPE##4){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE##8 upsample(GENTYPE##8 hi, UGENTYPE##8 lo){ \
-        return (BGENTYPE##8){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
-    } \
-    _CLC_OVERLOAD _CLC_DEF BGENTYPE##16 upsample(GENTYPE##16 hi, UGENTYPE##16 lo){ \
-        return (BGENTYPE##16){upsample(hi.lo, lo.lo), upsample(hi.hi, lo.hi)}; \
-    } \
+// Emits a single upsample overload whose body forwards both operands to
+// __clc_upsample, which is expected to come from the
+// <clc/integer/clc_upsample.h> include.
+// RTYPE is the return type; HTYPE and LTYPE are the types of hi and lo.
+#define __CLC_UPSAMPLE_FWD(RTYPE, HTYPE, LTYPE)                                \
+  _CLC_OVERLOAD _CLC_DEF RTYPE upsample(HTYPE hi, LTYPE lo) {                  \
+    return __clc_upsample(hi, lo);                                             \
+  }
+
+// Instantiates the scalar overload plus the 2-, 3-, 4-, 8- and 16-element
+// vector overloads for one (result, hi, lo) triple of scalar type names.
+// Each vector type name is built by pasting the element count onto the
+// corresponding scalar type name.
+#define __CLC_UPSAMPLE_IMPL(BGENTYPE, GENTYPE, UGENTYPE)                       \
+  __CLC_UPSAMPLE_FWD(BGENTYPE, GENTYPE, UGENTYPE)                              \
+  __CLC_UPSAMPLE_FWD(BGENTYPE##2, GENTYPE##2, UGENTYPE##2)                     \
+  __CLC_UPSAMPLE_FWD(BGENTYPE##3, GENTYPE##3, UGENTYPE##3)                     \
+  __CLC_UPSAMPLE_FWD(BGENTYPE##4, GENTYPE##4, UGENTYPE##4)                     \
+  __CLC_UPSAMPLE_FWD(BGENTYPE##8, GENTYPE##8, UGENTYPE##8)                     \
+  __CLC_UPSAMPLE_FWD(BGENTYPE##16, GENTYPE##16, UGENTYPE##16)
 
-#define __CLC_UPSAMPLE_TYPES() \
-    __CLC_UPSAMPLE_IMPL(short, char, uchar, 8) \
-    __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar, 8) \
-    __CLC_UPSAMPLE_IMPL(int, short, ushort, 16) \
-    __CLC_UPSAMPLE_IMPL(uint, ushort, ushort, 16) \
-    __CLC_UPSAMPLE_IMPL(long, int, uint, 32) \
-    __CLC_UPSAMPLE_IMPL(ulong, uint, uint, 32) \
+#define __CLC_UPSAMPLE_TYPES() /* arguments: result, hi, lo types */           \
+  __CLC_UPSAMPLE_IMPL(short, char, uchar)                                      \
+  __CLC_UPSAMPLE_IMPL(ushort, uchar, uchar)                                    \
+  __CLC_UPSAMPLE_IMPL(int, short, ushort)                                      \
+  __CLC_UPSAMPLE_IMPL(uint, ushort, ushort)                                    \
+  __CLC_UPSAMPLE_IMPL(long, int, uint)                                         \
+  __CLC_UPSAMPLE_IMPL(ulong, uint, uint)
 
 __CLC_UPSAMPLE_TYPES()
 
diff --git a/libclc/generic/lib/math/clc_fma.cl b/libclc/generic/lib/math/clc_fma.cl
index 15de4c8032a932..ed23b3eba26a58 100644
--- a/libclc/generic/lib/math/clc_fma.cl
+++ b/libclc/generic/lib/math/clc_fma.cl
@@ -23,6 +23,7 @@
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
 #include <clc/integer/clc_abs.h>
+#include <clc/integer/clc_clz.h>
 #include <clc/relational/clc_isinf.h>
 #include <clc/relational/clc_isnan.h>
 #include <clc/shared/clc_max.h>
@@ -119,7 +120,7 @@ _CLC_DEF _CLC_OVERLOAD float __clc_sw_fma(float a, float b, float c) {
   }
 
   // detect overflow/underflow
-  int overflow_bits = 3 - clz(st_fma.mantissa);
+  int overflow_bits = 3 - __clc_clz(st_fma.mantissa);
 
   // adjust exponent
   st_fma.exponent += overflow_bits;
diff --git a/libclc/generic/lib/math/clc_fmod.cl b/libclc/generic/lib/math/clc_fmod.cl
index 5d101373178dd1..efe3422c475265 100644
--- a/libclc/generic/lib/math/clc_fmod.cl
+++ b/libclc/generic/lib/math/clc_fmod.cl
@@ -22,6 +22,7 @@
 
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
+#include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/shared/clc_max.h>
@@ -88,14 +89,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_fmod(double x, double y)
     ulong xsgn = ux ^ ax;
     double dx = as_double(ax);
     int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
-    int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64);
+    int xexp1 = 11 - (int) __clc_clz(ax & MANTBITS_DP64);
     xexp1 = xexp < 1 ? xexp1 : xexp;
 
     ulong uy = as_ulong(y);
     ulong ay = uy & ~SIGNBIT_DP64;
     double dy = as_double(ay);
     int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
-    int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64);
+    int yexp1 = 11 - (int) __clc_clz(ay & MANTBITS_DP64);
     yexp1 = yexp < 1 ? yexp1 : yexp;
 
     // First assume |x| > |y|
diff --git a/libclc/generic/lib/math/clc_remainder.cl b/libclc/generic/lib/math/clc_remainder.cl
index 8a0ce8816fcb38..e88a2ff91e9896 100644
--- a/libclc/generic/lib/math/clc_remainder.cl
+++ b/libclc/generic/lib/math/clc_remainder.cl
@@ -22,6 +22,7 @@
 
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
+#include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/shared/clc_max.h>
@@ -96,14 +97,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remainder(double x, double y)
     ulong xsgn = ux ^ ax;
     double dx = as_double(ax);
     int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
-    int xexp1 = 11 - (int) clz(ax & MANTBITS_DP64);
+    int xexp1 = 11 - (int) __clc_clz(ax & MANTBITS_DP64);
     xexp1 = xexp < 1 ? xexp1 : xexp;
 
     ulong uy = as_ulong(y);
     ulong ay = uy & ~SIGNBIT_DP64;
     double dy = as_double(ay);
     int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
-    int yexp1 = 11 - (int) clz(ay & MANTBITS_DP64);
+    int yexp1 = 11 - (int) __clc_clz(ay & MANTBITS_DP64);
     yexp1 = yexp < 1 ? yexp1 : yexp;
 
     int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
diff --git a/libclc/generic/lib/math/clc_remquo.cl b/libclc/generic/lib/math/clc_remquo.cl
index 8d2e5f9a74bfef..f0f69e9e192259 100644
--- a/libclc/generic/lib/math/clc_remquo.cl
+++ b/libclc/generic/lib/math/clc_remquo.cl
@@ -22,6 +22,7 @@
 
 #include <clc/clc.h>
 #include <clc/clcmacro.h>
+#include <clc/integer/clc_clz.h>
 #include <clc/math/clc_floor.h>
 #include <clc/math/clc_trunc.h>
 #include <clc/shared/clc_max.h>
@@ -135,14 +136,14 @@ _CLC_DEF _CLC_OVERLOAD double __clc_remquo(double x, double y,
   ulong xsgn = ux ^ ax;
   double dx = as_double(ax);
   int xexp = convert_int(ax >> EXPSHIFTBITS_DP64);
-  int xexp1 = 11 - (int)clz(ax & MANTBITS_DP64);
+  int xexp1 = 11 - (int)__clc_clz(ax & MANTBITS_DP64);
   xexp1 = xexp < 1 ? xexp1 : xexp;
 
   ulong uy = as_ulong(y);
   ulong ay = uy & ~SIGNBIT_DP64;
   double dy = as_double(ay);
   int yexp = convert_int(ay >> EXPSHIFTBITS_DP64);
-  int yexp1 = 11 - (int)clz(ay & MANTBITS_DP64);
+  int yexp1 = 11 - (int)__clc_clz(ay & MANTBITS_DP64);
   yexp1 = yexp < 1 ? yexp1 : yexp;
 
   int qsgn = ((ux ^ uy) & SIGNBIT_DP64) == 0UL ? 1 : -1;
diff --git a/libclc/generic/lib/math/sincos_helpers.cl b/libclc/generic/lib/math/sincos_helpers.cl
index 0adecf6978bcab..c9e04e8b00e79f 100644
--- a/libclc/generic/lib/math/sincos_helpers.cl
+++ b/libclc/generic/lib/math/sincos_helpers.cl
@@ -21,6 +21,8 @@
  */
 
 #include <clc/clc.h>
+#include <clc/integer/clc_clz.h>
+#include <clc/integer/clc_mul_hi.h>
 #include <clc/shared/clc_max.h>
 
 #include "math.h"
@@ -169,14 +171,14 @@ _CLC_DEF int __clc_argReductionSmallS(float *r, float *rr, float x)
     return (int)fnpi2 & 0x3;
 }
 
-#define FULL_MUL(A, B, HI, LO) \
-    LO = A * B; \
-    HI = mul_hi(A, B)
+#define FULL_MUL(A, B, HI, LO) /* LO = low word, HI = high word of A * B */    \
+  LO = (A) * (B);                                                              \
+  HI = __clc_mul_hi(A, B)
 
-#define FULL_MAD(A, B, C, HI, LO) \
-    LO = ((A) * (B) + (C)); \
-    HI = mul_hi(A, B); \
-    HI += LO < C
+#define FULL_MAD(A, B, C, HI, LO) /* HI:LO = A * B + C */                      \
+  LO = ((A) * (B) + (C));                                                      \
+  HI = __clc_mul_hi(A, B);                                                     \
+  HI += (LO) < (C) /* carry out of LO; assumes unsigned operands */
 
 _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
 {
@@ -269,7 +271,7 @@ _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
     p5 = p5 ^ flip;
 
     // Find exponent and shift away leading zeroes and hidden bit
-    xe = clz(p7) + 1;
+    xe = __clc_clz(p7) + 1;
     shift = 32 - xe;
     p7 = bitalign(p7, p6, shift);
     p6 = bitalign(p6, p5, shift);
@@ -281,7 +283,7 @@ _CLC_DEF int __clc_argReductionLargeS(float *r, float *rr, float x)
     p7 = bitalign(p7, p6, 32-23);
 
     // Get 24 more bits of fraction in another float, there are not long strings of zeroes here
-    int xxe = clz(p7) + 1;
+    int xxe = __clc_clz(p7) + 1;
     p7 = bitalign(p7, p6, 32-xxe);
     float q0 = as_float(sign | ((127 - (xe + 23 + xxe)) << 23) | (p7 >> 9));