[libclc] 299a278 - [libclc] Improving vector code generated from scalar code (#140008)
via cfe-commits
cfe-commits at lists.llvm.org
Fri May 16 02:20:36 PDT 2025
Author: Wenju He
Date: 2025-05-16T10:20:32+01:00
New Revision: 299a278db16fa0944472af79bfec31dd678c5b37
URL: https://github.com/llvm/llvm-project/commit/299a278db16fa0944472af79bfec31dd678c5b37
DIFF: https://github.com/llvm/llvm-project/commit/299a278db16fa0944472af79bfec31dd678c5b37.diff
LOG: [libclc] Improving vector code generated from scalar code (#140008)
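The previous method split the vector data into two halves, applied the
function to each half, and relied on shuffle_vector to concatenate the two
results into a vector of the original size. This PR eliminates the use of
shuffle_vector by calling the function on each vector element directly.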
Added:
Modified:
libclc/clc/include/clc/clcmacro.h
libclc/clc/lib/generic/math/clc_lgamma_r.cl
Removed:
################################################################################
diff --git a/libclc/clc/include/clc/clcmacro.h b/libclc/clc/include/clc/clcmacro.h
index d8772ce38792a..c9f70d2998d37 100644
--- a/libclc/clc/include/clc/clcmacro.h
+++ b/libclc/clc/include/clc/clcmacro.h
@@ -14,100 +14,140 @@
#define _CLC_UNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE) \
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x) { \
- return (RET_TYPE##2)(FUNCTION(x.x), FUNCTION(x.y)); \
+ return (RET_TYPE##2)(FUNCTION(x.s0), FUNCTION(x.s1)); \
} \
\
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x) { \
- return (RET_TYPE##3)(FUNCTION(x.x), FUNCTION(x.y), FUNCTION(x.z)); \
+ return (RET_TYPE##3)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2)); \
} \
\
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x) { \
- return (RET_TYPE##4)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ return (RET_TYPE##4)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \
+ FUNCTION(x.s3)); \
} \
\
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x) { \
- return (RET_TYPE##8)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ return (RET_TYPE##8)(FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), \
+ FUNCTION(x.s3), FUNCTION(x.s4), FUNCTION(x.s5), \
+ FUNCTION(x.s6), FUNCTION(x.s7)); \
} \
\
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x) { \
- return (RET_TYPE##16)(FUNCTION(x.lo), FUNCTION(x.hi)); \
+ return (RET_TYPE##16)( \
+ FUNCTION(x.s0), FUNCTION(x.s1), FUNCTION(x.s2), FUNCTION(x.s3), \
+ FUNCTION(x.s4), FUNCTION(x.s5), FUNCTION(x.s6), FUNCTION(x.s7), \
+ FUNCTION(x.s8), FUNCTION(x.s9), FUNCTION(x.sa), FUNCTION(x.sb), \
+ FUNCTION(x.sc), FUNCTION(x.sd), FUNCTION(x.se), FUNCTION(x.sf)); \
}
#define _CLC_BINARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
ARG2_TYPE) \
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y) { \
- return (RET_TYPE##2)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y)); \
+ return (RET_TYPE##2)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1)); \
} \
\
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y) { \
- return (RET_TYPE##3)(FUNCTION(x.x, y.x), FUNCTION(x.y, y.y), \
- FUNCTION(x.z, y.z)); \
+ return (RET_TYPE##3)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
+ FUNCTION(x.s2, y.s2)); \
} \
\
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y) { \
- return (RET_TYPE##4)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ return (RET_TYPE##4)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
+ FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3)); \
} \
\
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y) { \
- return (RET_TYPE##8)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ return (RET_TYPE##8)(FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), \
+ FUNCTION(x.s2, y.s2), FUNCTION(x.s3, y.s3), \
+ FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \
+ FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7)); \
} \
\
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y) { \
- return (RET_TYPE##16)(FUNCTION(x.lo, y.lo), FUNCTION(x.hi, y.hi)); \
+ return (RET_TYPE##16)( \
+ FUNCTION(x.s0, y.s0), FUNCTION(x.s1, y.s1), FUNCTION(x.s2, y.s2), \
+ FUNCTION(x.s3, y.s3), FUNCTION(x.s4, y.s4), FUNCTION(x.s5, y.s5), \
+ FUNCTION(x.s6, y.s6), FUNCTION(x.s7, y.s7), FUNCTION(x.s8, y.s8), \
+ FUNCTION(x.s9, y.s9), FUNCTION(x.sa, y.sa), FUNCTION(x.sb, y.sb), \
+ FUNCTION(x.sc, y.sc), FUNCTION(x.sd, y.sd), FUNCTION(x.se, y.se), \
+ FUNCTION(x.sf, y.sf)); \
}
#define _CLC_V_S_V_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
ARG2_TYPE) \
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE x, ARG2_TYPE##2 y) { \
- return (RET_TYPE##2)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+ return (RET_TYPE##2)(FUNCTION(x, y.s0), FUNCTION(x, y.s1)); \
} \
\
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE x, ARG2_TYPE##3 y) { \
- return (RET_TYPE##3)(FUNCTION(x, y.x), FUNCTION(x, y.y), \
- FUNCTION(x, y.z)); \
+ return (RET_TYPE##3)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
+ FUNCTION(x, y.s2)); \
} \
\
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE x, ARG2_TYPE##4 y) { \
- return (RET_TYPE##4)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+ return (RET_TYPE##4)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
+ FUNCTION(x, y.s2), FUNCTION(x, y.s3)); \
} \
\
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE x, ARG2_TYPE##8 y) { \
- return (RET_TYPE##8)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+ return (RET_TYPE##8)(FUNCTION(x, y.s0), FUNCTION(x, y.s1), \
+ FUNCTION(x, y.s2), FUNCTION(x, y.s3), \
+ FUNCTION(x, y.s4), FUNCTION(x, y.s5), \
+ FUNCTION(x, y.s6), FUNCTION(x, y.s7)); \
} \
\
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE x, ARG2_TYPE##16 y) { \
- return (RET_TYPE##16)(FUNCTION(x, y.lo), FUNCTION(x, y.hi)); \
+ return (RET_TYPE##16)( \
+ FUNCTION(x, y.s0), FUNCTION(x, y.s1), FUNCTION(x, y.s2), \
+ FUNCTION(x, y.s3), FUNCTION(x, y.s4), FUNCTION(x, y.s5), \
+ FUNCTION(x, y.s6), FUNCTION(x, y.s7), FUNCTION(x, y.s8), \
+ FUNCTION(x, y.s9), FUNCTION(x, y.sa), FUNCTION(x, y.sb), \
+ FUNCTION(x, y.sc), FUNCTION(x, y.sd), FUNCTION(x, y.se), \
+ FUNCTION(x, y.sf)); \
}
#define _CLC_TERNARY_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
ARG2_TYPE, ARG3_TYPE) \
DECLSPEC RET_TYPE##2 FUNCTION(ARG1_TYPE##2 x, ARG2_TYPE##2 y, \
ARG3_TYPE##2 z) { \
- return (RET_TYPE##2)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y)); \
+ return (RET_TYPE##2)(FUNCTION(x.s0, y.s0, z.s0), \
+ FUNCTION(x.s1, y.s1, z.s1)); \
} \
\
DECLSPEC RET_TYPE##3 FUNCTION(ARG1_TYPE##3 x, ARG2_TYPE##3 y, \
ARG3_TYPE##3 z) { \
- return (RET_TYPE##3)(FUNCTION(x.x, y.x, z.x), FUNCTION(x.y, y.y, z.y), \
- FUNCTION(x.z, y.z, z.z)); \
+ return (RET_TYPE##3)(FUNCTION(x.s0, y.s0, z.s0), \
+ FUNCTION(x.s1, y.s1, z.s1), \
+ FUNCTION(x.s2, y.s2, z.s2)); \
} \
\
DECLSPEC RET_TYPE##4 FUNCTION(ARG1_TYPE##4 x, ARG2_TYPE##4 y, \
ARG3_TYPE##4 z) { \
- return (RET_TYPE##4)(FUNCTION(x.lo, y.lo, z.lo), \
- FUNCTION(x.hi, y.hi, z.hi)); \
+ return (RET_TYPE##4)( \
+ FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
+ FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3)); \
} \
\
DECLSPEC RET_TYPE##8 FUNCTION(ARG1_TYPE##8 x, ARG2_TYPE##8 y, \
ARG3_TYPE##8 z) { \
- return (RET_TYPE##8)(FUNCTION(x.lo, y.lo, z.lo), \
- FUNCTION(x.hi, y.hi, z.hi)); \
+ return (RET_TYPE##8)( \
+ FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
+ FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \
+ FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \
+ FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7)); \
} \
\
DECLSPEC RET_TYPE##16 FUNCTION(ARG1_TYPE##16 x, ARG2_TYPE##16 y, \
ARG3_TYPE##16 z) { \
- return (RET_TYPE##16)(FUNCTION(x.lo, y.lo, z.lo), \
- FUNCTION(x.hi, y.hi, z.hi)); \
+ return (RET_TYPE##16)( \
+ FUNCTION(x.s0, y.s0, z.s0), FUNCTION(x.s1, y.s1, z.s1), \
+ FUNCTION(x.s2, y.s2, z.s2), FUNCTION(x.s3, y.s3, z.s3), \
+ FUNCTION(x.s4, y.s4, z.s4), FUNCTION(x.s5, y.s5, z.s5), \
+ FUNCTION(x.s6, y.s6, z.s6), FUNCTION(x.s7, y.s7, z.s7), \
+ FUNCTION(x.s8, y.s8, z.s8), FUNCTION(x.s9, y.s9, z.s9), \
+ FUNCTION(x.sa, y.sa, z.sa), FUNCTION(x.sb, y.sb, z.sb), \
+ FUNCTION(x.sc, y.sc, z.sc), FUNCTION(x.sd, y.sd, z.sd), \
+ FUNCTION(x.se, y.se, z.se), FUNCTION(x.sf, y.sf, z.sf)); \
}
#define _CLC_V_V_VP_VECTORIZE(DECLSPEC, RET_TYPE, FUNCTION, ARG1_TYPE, \
@@ -115,48 +155,53 @@
DECLSPEC __CLC_XCONCAT(RET_TYPE, 2) \
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 2) x, \
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) * y) { \
- return (__CLC_XCONCAT(RET_TYPE, 2))( \
- FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \
- FUNCTION(x.y, \
- (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1))); \
+ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
+ return (__CLC_XCONCAT(RET_TYPE, 2))(FUNCTION(x.s0, ptr), \
+ FUNCTION(x.s1, ptr + 1)); \
} \
\
DECLSPEC __CLC_XCONCAT(RET_TYPE, 3) \
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 3) x, \
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 3) * y) { \
- return (__CLC_XCONCAT(RET_TYPE, 3))( \
- FUNCTION(x.x, (ADDR_SPACE ARG2_TYPE *)y), \
- FUNCTION(x.y, \
- (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 1)), \
- FUNCTION(x.z, \
- (ADDR_SPACE ARG2_TYPE *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \
+ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
+ return (__CLC_XCONCAT(RET_TYPE, 3))(FUNCTION(x.s0, ptr), \
+ FUNCTION(x.s1, ptr + 1), \
+ FUNCTION(x.s2, ptr + 2)); \
} \
\
DECLSPEC __CLC_XCONCAT(RET_TYPE, 4) \
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 4) x, \
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) * y) { \
+ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
return (__CLC_XCONCAT(RET_TYPE, 4))( \
- FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 2) *)y), \
- FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
- ARG2_TYPE, 2) *)((ADDR_SPACE ARG2_TYPE *)y + 2))); \
+ FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
+ FUNCTION(x.s3, ptr + 3)); \
} \
\
DECLSPEC __CLC_XCONCAT(RET_TYPE, 8) \
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 8) x, \
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) * y) { \
+ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
return (__CLC_XCONCAT(RET_TYPE, 8))( \
- FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 4) *)y), \
- FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
- ARG2_TYPE, 4) *)((ADDR_SPACE ARG2_TYPE *)y + 4))); \
+ FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
+ FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \
+ FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \
+ FUNCTION(x.s7, ptr + 7)); \
} \
\
DECLSPEC __CLC_XCONCAT(RET_TYPE, 16) \
FUNCTION(__CLC_XCONCAT(ARG1_TYPE, 16) x, \
ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 16) * y) { \
+ ADDR_SPACE ARG2_TYPE *ptr = (ADDR_SPACE ARG2_TYPE *)y; \
return (__CLC_XCONCAT(RET_TYPE, 16))( \
- FUNCTION(x.lo, (ADDR_SPACE __CLC_XCONCAT(ARG2_TYPE, 8) *)y), \
- FUNCTION(x.hi, (ADDR_SPACE __CLC_XCONCAT( \
- ARG2_TYPE, 8) *)((ADDR_SPACE ARG2_TYPE *)y + 8))); \
+ FUNCTION(x.s0, ptr), FUNCTION(x.s1, ptr + 1), FUNCTION(x.s2, ptr + 2), \
+ FUNCTION(x.s3, ptr + 3), FUNCTION(x.s4, ptr + 4), \
+ FUNCTION(x.s5, ptr + 5), FUNCTION(x.s6, ptr + 6), \
+ FUNCTION(x.s7, ptr + 7), FUNCTION(x.s8, ptr + 8), \
+ FUNCTION(x.s9, ptr + 9), FUNCTION(x.sa, ptr + 10), \
+ FUNCTION(x.sb, ptr + 11), FUNCTION(x.sc, ptr + 12), \
+ FUNCTION(x.sd, ptr + 13), FUNCTION(x.se, ptr + 14), \
+ FUNCTION(x.sf, ptr + 15)); \
}
#define _CLC_DEFINE_BINARY_BUILTIN(RET_TYPE, FUNCTION, BUILTIN, ARG1_TYPE, \
diff --git a/libclc/clc/lib/generic/math/clc_lgamma_r.cl b/libclc/clc/lib/generic/math/clc_lgamma_r.cl
index ad3d63b734eca..96a42bbb6e158 100644
--- a/libclc/clc/lib/generic/math/clc_lgamma_r.cl
+++ b/libclc/clc/lib/generic/math/clc_lgamma_r.cl
@@ -406,13 +406,13 @@ _CLC_V_V_VP_VECTORIZE(_CLC_OVERLOAD _CLC_DEF, float, __clc_lgamma_r, float,
#define v4 1.04222645593369134254e-01 /* 0x3FBAAE55, 0xD6537C88 */
#define v5 3.21709242282423911810e-03 /* 0x3F6A5ABB, 0x57D0CF61 */
-#define s0 -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
-#define s1 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */
-#define s2 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */
-#define s3 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */
-#define s4 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */
-#define s5 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */
-#define s6 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */
+#define s0_d -7.72156649015328655494e-02 /* 0xBFB3C467, 0xE37DB0C8 */
+#define s1_d 2.14982415960608852501e-01 /* 0x3FCB848B, 0x36E20878 */
+#define s2_d 3.25778796408930981787e-01 /* 0x3FD4D98F, 0x4F139F59 */
+#define s3_d 1.46350472652464452805e-01 /* 0x3FC2BB9C, 0xBEE5F2F7 */
+#define s4_d 2.66422703033638609560e-02 /* 0x3F9B481C, 0x7E939961 */
+#define s5_d 1.84028451407337715652e-03 /* 0x3F5E26B6, 0x7368F239 */
+#define s6_d 3.19475326584100867617e-05 /* 0x3F00BFEC, 0xDD17E945 */
#define r1 1.39200533467621045958e+00 /* 0x3FF645A7, 0x62C4AB74 */
#define r2 7.21935547567138069525e-01 /* 0x3FE71A18, 0x93D3DCDC */
@@ -530,10 +530,12 @@ _CLC_OVERLOAD _CLC_DEF double __clc_lgamma_r(double x, private int *ip) {
__clc_fma(
y,
__clc_fma(
- y, __clc_fma(y, __clc_fma(y, __clc_fma(y, s6, s5), s4), s3),
- s2),
- s1),
- s0);
+ y,
+ __clc_fma(y, __clc_fma(y, __clc_fma(y, s6_d, s5_d), s4_d),
+ s3_d),
+ s2_d),
+ s1_d),
+ s0_d);
double q = __clc_fma(
y,
__clc_fma(
More information about the cfe-commits
mailing list