[libclc] [libclc] Optimize CLC vector relational builtins (PR #124537)

Mon Jan 27 04:38:00 PST 2025

https://github.com/frasercrmck created https://github.com/llvm/llvm-project/pull/124537

Clang knows how to perform relational operations on OpenCL vectors, so we don't need to use the Clang builtins. The builtins we were using didn't support vector types, so we were previously scalarizing.

This commit generates the same LLVM fcmp operations as before, just without the scalarization.

>From bc7ce5f3ab9e796475d017d8b169b916adc1e489 Mon Sep 17 00:00:00 2001
From: Fraser Cormack <fraser at codeplay.com>
Date: Mon, 27 Jan 2025 12:35:51 +0000
Subject: [PATCH] [libclc] Optimize CLC vector relational builtins

Clang knows how to perform relational operations on OpenCL vectors, so
we don't need to use the Clang builtins. The builtins we were using
didn't support vector types, so we were previously scalarizing.

This commit generates the same LLVM fcmp operations as before, just
without the scalarization.
---
 .../clc/include/clc/relational/relational.h   | 26 +++++++++++++
 .../clc/lib/generic/relational/clc_isequal.cl | 38 ++++++-------------
 .../lib/generic/relational/clc_isgreater.cl   | 23 +++--------
 .../generic/relational/clc_isgreaterequal.cl  | 30 ++++++---------
 .../clc/lib/generic/relational/clc_isless.cl  | 31 ++++++---------
 .../lib/generic/relational/clc_islessequal.cl | 24 ++++--------
 .../generic/relational/clc_islessgreater.cl   | 27 +++++--------
 .../lib/generic/relational/clc_isnotequal.cl  | 21 ++++------
 8 files changed, 90 insertions(+), 130 deletions(-)

diff --git a/libclc/clc/include/clc/relational/relational.h b/libclc/clc/include/clc/relational/relational.h
index 54241b6493c8e7..f32e7630203e4b 100644
--- a/libclc/clc/include/clc/relational/relational.h
+++ b/libclc/clc/include/clc/relational/relational.h
@@ -142,4 +142,30 @@
   _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(RET_TYPE, FUNCTION, ARG0_TYPE,         \
                                         ARG1_TYPE)
 
+#define _CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(RET_TYPE, RET_TYPE_VEC, FUNCTION, \
+                                             ARG1_TYPE, ARG2_TYPE)             \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##2 FUNCTION(ARG1_TYPE##2 x,              \
+                                                  ARG2_TYPE##2 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##3 FUNCTION(ARG1_TYPE##3 x,              \
+                                                  ARG2_TYPE##3 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##4 FUNCTION(ARG1_TYPE##4 x,              \
+                                                  ARG2_TYPE##4 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##8 FUNCTION(ARG1_TYPE##8 x,              \
+                                                  ARG2_TYPE##8 y) {            \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }                                                                            \
+  _CLC_DEF _CLC_OVERLOAD RET_TYPE_VEC##16 FUNCTION(ARG1_TYPE##16 x,            \
+                                                   ARG2_TYPE##16 y) {          \
+    return _CLC_RELATIONAL_OP(x, y);                                           \
+  }
+
 #endif // __CLC_RELATIONAL_RELATIONAL_H__
diff --git a/libclc/clc/lib/generic/relational/clc_isequal.cl b/libclc/clc/lib/generic/relational/clc_isequal.cl
index 7664df7767cb3f..053a237289fd60 100644
--- a/libclc/clc/lib/generic/relational/clc_isequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isequal.cl
@@ -1,44 +1,28 @@
 #include <clc/internal/clc.h>
+#include <clc/relational/relational.h>
 
-#define _CLC_DEFINE_ISEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)          \
-  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
-    return (x == y);                                                           \
-  }
+#define _CLC_RELATIONAL_OP(X, Y) (X) == (Y)
 
-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, float, float)
-_CLC_DEFINE_ISEQUAL(int2, __clc_isequal, float2, float2)
-_CLC_DEFINE_ISEQUAL(int3, __clc_isequal, float3, float3)
-_CLC_DEFINE_ISEQUAL(int4, __clc_isequal, float4, float4)
-_CLC_DEFINE_ISEQUAL(int8, __clc_isequal, float8, float8)
-_CLC_DEFINE_ISEQUAL(int16, __clc_isequal, float16, float16)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isequal, float, float)
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-// The scalar version of __clc_isequal(double) returns an int, but the vector
-// versions return long.
-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, double, double)
-_CLC_DEFINE_ISEQUAL(long2, __clc_isequal, double2, double2)
-_CLC_DEFINE_ISEQUAL(long3, __clc_isequal, double3, double3)
-_CLC_DEFINE_ISEQUAL(long4, __clc_isequal, double4, double4)
-_CLC_DEFINE_ISEQUAL(long8, __clc_isequal, double8, double8)
-_CLC_DEFINE_ISEQUAL(long16, __clc_isequal, double16, double16)
+// The scalar version of __clc_isequal(double, double) returns an int, but the
+// vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isequal, double, double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// The scalar version of __clc_isequal(half) returns an int, but the vector
-// versions return short.
-_CLC_DEFINE_ISEQUAL(int, __clc_isequal, half, half)
-_CLC_DEFINE_ISEQUAL(short2, __clc_isequal, half2, half2)
-_CLC_DEFINE_ISEQUAL(short3, __clc_isequal, half3, half3)
-_CLC_DEFINE_ISEQUAL(short4, __clc_isequal, half4, half4)
-_CLC_DEFINE_ISEQUAL(short8, __clc_isequal, half8, half8)
-_CLC_DEFINE_ISEQUAL(short16, __clc_isequal, half16, half16)
+// The scalar version of __clc_isequal(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isequal, half, half)
 
 #endif
 
-#undef _CLC_DEFINE_ISEQUAL
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isgreater.cl b/libclc/clc/lib/generic/relational/clc_isgreater.cl
index 39fb6b07fb1859..ec14fa9a2ec08e 100644
--- a/libclc/clc/lib/generic/relational/clc_isgreater.cl
+++ b/libclc/clc/lib/generic/relational/clc_isgreater.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_isgreater with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) > (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float,
-                              float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreater, float, float)
 
 #ifdef cl_khr_fp64
 
@@ -14,12 +11,7 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreater, __builtin_isgreater, float,
 
 // The scalar version of __clc_isgreater(double, double) returns an int, but the
 // vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(double x, double y) {
-  return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreater, double, double)
 
 #endif
 
@@ -29,11 +21,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreater, double, double)
 
 // The scalar version of __clc_isgreater(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreater(half x, half y) {
-  return __builtin_isgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreater, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreater, half, half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
index ccf7c881a5549b..e96f2325cbad4c 100644
--- a/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isgreaterequal.cl
@@ -1,12 +1,10 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_isgreaterequal with vector inputs,
-// but it seems to only take scalar values as input, which will produce
-// incorrect output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) >= (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal,
-                              __builtin_isgreaterequal, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isgreaterequal, float,
+                                     float)
 
 #ifdef cl_khr_fp64
 
@@ -14,26 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isgreaterequal,
 
 // The scalar version of __clc_isgreaterequal(double, double) returns an int,
 // but the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(double x, double y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isgreaterequal, double,
-                                      double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isgreaterequal, double,
+                                     double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// The scalar version of __clc_isgreaterequal(half, half) returns an int, but
+// The scalar version of __clc_isgreaterequal(half, half) returns an int, but
 // the vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isgreaterequal(half x, half y) {
-  return __builtin_isgreaterequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isgreaterequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isgreaterequal, half,
+                                     half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isless.cl b/libclc/clc/lib/generic/relational/clc_isless.cl
index 1204a5057d8640..0ce001d31d6965 100644
--- a/libclc/clc/lib/generic/relational/clc_isless.cl
+++ b/libclc/clc/lib/generic/relational/clc_isless.cl
@@ -1,37 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_isless with vector inputs, but it
-// seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) < (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_isless, __builtin_isless, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isless, float, float)
 
 #ifdef cl_khr_fp64
 
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
-// The scalar version of __clc_isless(double, double) returns an int, but the
-// vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(double x, double y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isless, double, double)
+// The scalar version of __clc_isless(double, double) returns an int, but
+// the vector versions return long.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isless, double, double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
-// The scalar version of __clc_isless(half, half) returns an int, but the vector
-// versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_isless(half x, half y) {
-  return __builtin_isless(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isless, half, half)
+// The scalar version of __clc_isless(half, half) returns an int, but the
+// vector versions return short.
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isless, half, half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_islessequal.cl b/libclc/clc/lib/generic/relational/clc_islessequal.cl
index 6fde763263e2b0..2d1d6d199fdab4 100644
--- a/libclc/clc/lib/generic/relational/clc_islessequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessequal.cl
@@ -1,12 +1,9 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_islessequal with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) (X) <= (Y)
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal,
-                              float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessequal, float, float)
 
 #ifdef cl_khr_fp64
 
@@ -14,12 +11,8 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessequal, __builtin_islessequal,
 
 // The scalar version of __clc_islessequal(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(double x, double y) {
-  return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessequal, double,
+                                     double)
 
 #endif
 
@@ -29,11 +22,8 @@ _CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessequal, double, double)
 
 // The scalar version of __clc_islessequal(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessequal(half x, half y) {
-  return __builtin_islessequal(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessequal, half, half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_islessgreater.cl b/libclc/clc/lib/generic/relational/clc_islessgreater.cl
index 5106c9f460e2ca..3ca3c37731d15d 100644
--- a/libclc/clc/lib/generic/relational/clc_islessgreater.cl
+++ b/libclc/clc/lib/generic/relational/clc_islessgreater.cl
@@ -1,12 +1,10 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-// Note: It would be nice to use __builtin_islessgreater with vector inputs, but
-// it seems to only take scalar values as input, which will produce incorrect
-// output for vector input types.
+#define _CLC_RELATIONAL_OP(X, Y) ((X) < (Y)) || ((X) > (Y))
 
-_CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater,
-                              float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_islessgreater, float,
+                                     float)
 
 #ifdef cl_khr_fp64
 
@@ -14,25 +12,20 @@ _CLC_DEFINE_RELATIONAL_BINARY(int, __clc_islessgreater, __builtin_islessgreater,
 
 // The scalar version of __clc_islessgreater(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(double x, double y) {
-  return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_islessgreater, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_islessgreater, double,
+                                     double)
 
 #endif
+
 #ifdef cl_khr_fp16
 
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 // The scalar version of __clc_islessgreater(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEF _CLC_OVERLOAD int __clc_islessgreater(half x, half y) {
-  return __builtin_islessgreater(x, y);
-}
-
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_islessgreater, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_islessgreater, half,
+                                     half)
 
 #endif
+
+#undef _CLC_RELATIONAL_OP
diff --git a/libclc/clc/lib/generic/relational/clc_isnotequal.cl b/libclc/clc/lib/generic/relational/clc_isnotequal.cl
index 9f90713b2da508..d1ee4deab25c80 100644
--- a/libclc/clc/lib/generic/relational/clc_isnotequal.cl
+++ b/libclc/clc/lib/generic/relational/clc_isnotequal.cl
@@ -1,33 +1,28 @@
 #include <clc/internal/clc.h>
 #include <clc/relational/relational.h>
 
-#define _CLC_DEFINE_ISNOTEQUAL(RET_TYPE, FUNCTION, ARG1_TYPE, ARG2_TYPE)       \
-  _CLC_DEF _CLC_OVERLOAD RET_TYPE FUNCTION(ARG1_TYPE x, ARG2_TYPE y) {         \
-    return (x != y);                                                           \
-  }
+#define _CLC_RELATIONAL_OP(X, Y) (X) != (Y)
 
-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, float, float)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(int, __clc_isnotequal, float, float)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, int, __clc_isnotequal, float, float)
 
 #ifdef cl_khr_fp64
+
 #pragma OPENCL EXTENSION cl_khr_fp64 : enable
 
 // The scalar version of __clc_isnotequal(double, double) returns an int, but
 // the vector versions return long.
-
-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, double, double)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(long, __clc_isnotequal, double, double)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, long, __clc_isnotequal, double, double)
 
 #endif
+
 #ifdef cl_khr_fp16
+
 #pragma OPENCL EXTENSION cl_khr_fp16 : enable
 
 // The scalar version of __clc_isnotequal(half, half) returns an int, but the
 // vector versions return short.
-
-_CLC_DEFINE_ISNOTEQUAL(int, __clc_isnotequal, half, half)
-_CLC_DEFINE_RELATIONAL_BINARY_VEC_ALL(short, __clc_isnotequal, half, half)
+_CLC_DEFINE_SIMPLE_RELATIONAL_BINARY(int, short, __clc_isnotequal, half, half)
 
 #endif
 
-#undef _CLC_DEFINE_ISNOTEQUAL
+#undef _CLC_RELATIONAL_OP