[libclc] Move rotate to CLC library; optimize (PR #125713)

Tue Feb 4 08:32:08 PST 2025

https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/125713

This commit moves the rotate builtin to the CLC library.

It also optimizes rotate(x, n) to generate the @llvm.fshl(x, x, n) intrinsic directly, for both scalar and vector types. The previous implementation was too cautious in its handling of the shift amount; the OpenCL rules state that the shift amount is always treated as an unsigned value modulo the bitwidth.

From 76b633334bebd26c8eab6361a4651b976aa834f7 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Tue, 4 Feb 2025 16:26:49 +0000
Subject: [PATCH] [libclc] Move rotate to CLC library; optimize

This commit moves the rotate builtin to the CLC library.

It also optimizes rotate(x, n) to generate the @llvm.fshl(x, x, n)
intrinsic directly, for both scalar and vector types. The previous
implementation was too cautious in its handling of the shift amount; the
OpenCL rules state that the shift amount is always treated as an
unsigned value modulo the bitwidth.
---
 libclc/clc/include/clc/integer/clc_rotate.h   | 12 ++++++
 libclc/clc/lib/clspv/SOURCES                  |  1 +
 libclc/clc/lib/generic/SOURCES                |  1 +
 libclc/clc/lib/generic/integer/clc_rotate.cl  |  5 +++
 libclc/clc/lib/generic/integer/clc_rotate.inc | 22 ++++++++++
 libclc/clc/lib/spirv/SOURCES                  |  1 +
 libclc/generic/lib/integer/rotate.cl          |  5 ++-
 libclc/generic/lib/integer/rotate.inc         | 42 -------------------
 8 files changed, 46 insertions(+), 43 deletions(-)
 create mode 100644 libclc/clc/include/clc/integer/clc_rotate.h
 create mode 100644 libclc/clc/lib/generic/integer/clc_rotate.cl
 create mode 100644 libclc/clc/lib/generic/integer/clc_rotate.inc
 delete mode 100644 libclc/generic/lib/integer/rotate.inc

diff --git a/libclc/clc/include/clc/integer/clc_rotate.h b/libclc/clc/include/clc/integer/clc_rotate.h
new file mode 100644
index 00000000000000..21c945a9ae1bdc
--- /dev/null
+++ b/libclc/clc/include/clc/integer/clc_rotate.h
@@ -0,0 +1,12 @@
+#ifndef __CLC_INTEGER_CLC_ROTATE_H__
+#define __CLC_INTEGER_CLC_ROTATE_H__
+
+#define __CLC_FUNCTION __clc_rotate
+#define __CLC_BODY <clc/shared/binary_decl.inc>
+
+#include <clc/integer/gentype.inc>
+
+#undef __CLC_BODY
+#undef __CLC_FUNCTION
+
+#endif // __CLC_INTEGER_CLC_ROTATE_H__
diff --git a/libclc/clc/lib/clspv/SOURCES b/libclc/clc/lib/clspv/SOURCES
index 2581abe64f1443..e4c25bc8e86d40 100644
--- a/libclc/clc/lib/clspv/SOURCES
+++ b/libclc/clc/lib/clspv/SOURCES
@@ -6,6 +6,7 @@
 ../generic/integer/clc_mul_hi.cl
 ../generic/integer/clc_popcount.cl
 ../generic/integer/clc_rhadd.cl
+../generic/integer/clc_rotate.cl
 ../generic/integer/clc_sub_sat.cl
 ../generic/integer/clc_upsample.cl
 ../generic/math/clc_ceil.cl
diff --git a/libclc/clc/lib/generic/SOURCES b/libclc/clc/lib/generic/SOURCES
index 2f4df168f70745..4d962d06b24da8 100644
--- a/libclc/clc/lib/generic/SOURCES
+++ b/libclc/clc/lib/generic/SOURCES
@@ -12,6 +12,7 @@ integer/clc_mul24.cl
 integer/clc_mul_hi.cl
 integer/clc_popcount.cl
 integer/clc_rhadd.cl
+integer/clc_rotate.cl
 integer/clc_sub_sat.cl
 integer/clc_upsample.cl
 math/clc_ceil.cl
diff --git a/libclc/clc/lib/generic/integer/clc_rotate.cl b/libclc/clc/lib/generic/integer/clc_rotate.cl
new file mode 100644
index 00000000000000..7546862fe401e4
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_rotate.cl
@@ -0,0 +1,5 @@
+#include <clc/internal/clc.h>
+#include <clc/utils.h>
+
+#define __CLC_BODY <clc_rotate.inc>
+#include <clc/integer/gentype.inc>
diff --git a/libclc/clc/lib/generic/integer/clc_rotate.inc b/libclc/clc/lib/generic/integer/clc_rotate.inc
new file mode 100644
index 00000000000000..f144553eabd525
--- /dev/null
+++ b/libclc/clc/lib/generic/integer/clc_rotate.inc
@@ -0,0 +1,22 @@
+#define __CLC_AS_GENTYPE(x) __CLC_XCONCAT(__clc_as_, __CLC_GENTYPE)(x)
+#define __CLC_AS_U_GENTYPE(x) __CLC_XCONCAT(__clc_as_, __CLC_U_GENTYPE)(x)
+
+// The rotate(A, B) builtin left-rotates A following the usual OpenCL shift
+// modulo rules: the amount is the log2(N) least significant bits of B viewed
+// as an unsigned integer value, where N is __CLC_GENSIZE.
+// Thus we don't have to worry about signed shift amounts, and can perform the
+// computation in unsigned types.
+_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE __clc_rotate(__CLC_GENTYPE x,
+                                                  __CLC_GENTYPE n) {
+  __CLC_U_GENTYPE x_as_u = __CLC_AS_U_GENTYPE(x);
+  __CLC_U_GENTYPE mask = (__CLC_U_GENTYPE)(__CLC_GENSIZE - 1);
+
+  __CLC_U_GENTYPE lshift_amt = __CLC_AS_U_GENTYPE(n) & mask;
+  // Masking again maps N (when lshift_amt is 0) to a right-shift of 0.
+  __CLC_U_GENTYPE rshift_amt =
+      (((__CLC_U_GENTYPE)__CLC_GENSIZE - lshift_amt) & mask);
+
+  __CLC_U_GENTYPE result = (x_as_u << lshift_amt) | (x_as_u >> rshift_amt);
+
+  return __CLC_AS_GENTYPE(result);
+}
diff --git a/libclc/clc/lib/spirv/SOURCES b/libclc/clc/lib/spirv/SOURCES
index ddc9e4c49d8626..613bbe3d5f5f2b 100644
--- a/libclc/clc/lib/spirv/SOURCES
+++ b/libclc/clc/lib/spirv/SOURCES
@@ -10,6 +10,7 @@
 ../generic/integer/clc_mul_hi.cl
 ../generic/integer/clc_popcount.cl
 ../generic/integer/clc_rhadd.cl
+../generic/integer/clc_rotate.cl
 ../generic/integer/clc_sub_sat.cl
 ../generic/integer/clc_upsample.cl
 ../generic/math/clc_ceil.cl
diff --git a/libclc/generic/lib/integer/rotate.cl b/libclc/generic/lib/integer/rotate.cl
index 27ce515c729331..1e72af30f33f2b 100644
--- a/libclc/generic/lib/integer/rotate.cl
+++ b/libclc/generic/lib/integer/rotate.cl
@@ -1,4 +1,7 @@
 #include <clc/clc.h>
+#include <clc/integer/clc_rotate.h>
+
+#define FUNCTION rotate
+#define __CLC_BODY <clc/shared/binary_def.inc>
 
-#define __CLC_BODY <rotate.inc>
 #include <clc/integer/gentype.inc>
diff --git a/libclc/generic/lib/integer/rotate.inc b/libclc/generic/lib/integer/rotate.inc
deleted file mode 100644
index 33bb0a85241d20..00000000000000
--- a/libclc/generic/lib/integer/rotate.inc
+++ /dev/null
@@ -1,42 +0,0 @@
-/**
- * Not necessarily optimal... but it produces correct results (at least for int)
- * If we're lucky, LLVM will recognize the pattern and produce rotate
- * instructions:
- * http://llvm.1065342.n5.nabble.com/rotate-td47679.html
- * 
- * Eventually, someone should feel free to implement an llvm-specific version
- */
-
-_CLC_OVERLOAD _CLC_DEF __CLC_GENTYPE rotate(__CLC_GENTYPE x, __CLC_GENTYPE n){
-    //Try to avoid extra work if someone's spinning the value through multiple
-    //full rotations
-    n = n % (__CLC_GENTYPE)__CLC_GENSIZE;
-
-#ifdef __CLC_SCALAR
-    if (n > 0){
-        return (x << n) | (((__CLC_U_GENTYPE)x) >> (__CLC_GENSIZE - n));
-    } else if (n == 0){
-        return x;
-    } else {
-        return ( (((__CLC_U_GENTYPE)x) >> -n) | (x << (__CLC_GENSIZE + n)) );
-    }
-#else
-    //XXX: There's a lot of __builtin_astype calls to cast everything to
-    //     unsigned ... This should be improved so that if __CLC_GENTYPE==__CLC_U_GENTYPE, no
-    //     casts are required.
-    
-    __CLC_U_GENTYPE x_1 = __builtin_astype(x, __CLC_U_GENTYPE);
-
-    //XXX: Is (__CLC_U_GENTYPE >> S__CLC_GENTYPE) | (__CLC_U_GENTYPE << S__CLC_GENTYPE) legal?
-    //     If so, then combine the amt and shifts into a single set of statements
-    
-    __CLC_U_GENTYPE amt;
-    amt = (n < (__CLC_GENTYPE)0 ? __builtin_astype((__CLC_GENTYPE)0-n, __CLC_U_GENTYPE) : (__CLC_U_GENTYPE)0);
-    x_1 = (x_1 >> amt) | (x_1 << ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
-
-    amt = (n < (__CLC_GENTYPE)0 ? (__CLC_U_GENTYPE)0 : __builtin_astype(n, __CLC_U_GENTYPE));
-    x_1 = (x_1 << amt) | (x_1 >> ((__CLC_U_GENTYPE)__CLC_GENSIZE - amt));
-
-    return __builtin_astype(x_1, __CLC_GENTYPE);
-#endif
-}