[libclc] libclc: Add subgroup scan functions (PR #188828)
Matt Arsenault via cfe-commits
cfe-commits at lists.llvm.org
Fri Mar 27 01:24:35 PDT 2026
https://github.com/arsenm updated https://github.com/llvm/llvm-project/pull/188828
>From af35fee134b78932a9377f8808a217de69a001cc Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Tue, 10 Mar 2026 17:10:52 +0100
Subject: [PATCH 1/3] libclc: Add subgroup scan functions
Add the base implementation using ds_swizzle, which should work
on all subtargets. At least two more implementation paths are
available for newer targets.
---
.../clc/subgroup/clc_sub_group_broadcast.h | 2 +
.../include/clc/subgroup/clc_sub_group_scan.h | 20 +++
.../clc/subgroup/clc_sub_group_scan.inc | 27 ++++
libclc/clc/lib/amdgpu/CMakeLists.txt | 1 +
.../amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc | 87 ++++++++++++
.../lib/amdgpu/subgroup/clc_sub_group_scan.cl | 133 ++++++++++++++++++
.../amdgpu/subgroup/clc_sub_group_scan.inc | 83 +++++++++++
libclc/opencl/lib/generic/CMakeLists.txt | 2 +
.../subgroup/sub_group_scan_exclusive.cl | 15 ++
.../subgroup/sub_group_scan_exclusive.inc | 28 ++++
.../subgroup/sub_group_scan_inclusive.cl | 15 ++
.../subgroup/sub_group_scan_inclusive.inc | 28 ++++
12 files changed, 441 insertions(+)
create mode 100644 libclc/clc/include/clc/subgroup/clc_sub_group_scan.h
create mode 100644 libclc/clc/include/clc/subgroup/clc_sub_group_scan.inc
create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
create mode 100644 libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.inc
create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.cl
create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.inc
create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.cl
create mode 100644 libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.inc
diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_broadcast.h b/libclc/clc/include/clc/subgroup/clc_sub_group_broadcast.h
index d61f57860fe5b..e9fb566c54ef8 100644
--- a/libclc/clc/include/clc/subgroup/clc_sub_group_broadcast.h
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_broadcast.h
@@ -19,4 +19,6 @@
#define __CLC_BODY "clc/subgroup/clc_subgroup_broadcast.inc"
#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION
+
#endif // __CLC_SUBGROUP_CLC_SUB_GROUP_BROADCAST_H__
diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_scan.h b/libclc/clc/include/clc/subgroup/clc_sub_group_scan.h
new file mode 100644
index 0000000000000..a849ffa792758
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_scan.h
@@ -0,0 +1,20 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SUBGROUP_CLC_SUB_GROUP_SCAN_H__
+#define __CLC_SUBGROUP_CLC_SUB_GROUP_SCAN_H__
+
+#include "clc/internal/clc.h"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_scan.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_scan.inc"
+#include "clc/math/gentype.inc"
+
+#endif // __CLC_SUBGROUP_CLC_SUB_GROUP_SCAN_H__
diff --git a/libclc/clc/include/clc/subgroup/clc_sub_group_scan.inc b/libclc/clc/include/clc/subgroup/clc_sub_group_scan.inc
new file mode 100644
index 0000000000000..924da27782f32
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_scan.inc
@@ -0,0 +1,27 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// Declarations of the CLC-level subgroup scan entry points. Only scalar
+// overloads are declared; every function takes one __CLC_GENTYPE argument and
+// returns a __CLC_GENTYPE.
+#if defined(__CLC_SCALAR)
+
+// Inclusive scans: add, min, max.
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_inclusive_add(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_inclusive_min(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_inclusive_max(__CLC_GENTYPE x);
+
+// Exclusive scans: add, min, max.
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_exclusive_add(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_exclusive_min(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_scan_exclusive_max(__CLC_GENTYPE x);
+
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index ea79b2294d991..186b6ab0b85b1 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -30,6 +30,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
subgroup/clc_subgroup.cl
subgroup/clc_sub_group_broadcast.cl
subgroup/clc_sub_group_reduce.cl
+ subgroup/clc_sub_group_scan.cl
synchronization/clc_sub_group_barrier.cl
synchronization/clc_work_group_barrier.cl
workitem/clc_get_enqueued_local_size.cl
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
new file mode 100644
index 0000000000000..633d8b35534f7
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
@@ -0,0 +1,87 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(__CLC_SCALAR)
+
+#if (defined(__CLC_GEN_S) && __CLC_GENSIZE == 32) || \
+ defined(__CLC_FPSIZE) && __CLC_FPSIZE == 32
+
+static _CLC_OVERLOAD __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_bcastx2_lane0(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_bcastx2_lane0(__clc_as_uint(x)));
+}
+
+static _CLC_OVERLOAD __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_bcastx4_lane1(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_bcastx4_lane1(__clc_as_uint(x)));
+}
+
+static _CLC_OVERLOAD __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_bcastx8_lane3(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_bcastx8_lane3(__clc_as_uint(x)));
+}
+
+static _CLC_OVERLOAD __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_bcastx16_lane7(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_bcastx16_lane7(__clc_as_uint(x)));
+}
+
+static _CLC_OVERLOAD __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_bcastx32_lane15(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_bcastx32_lane15(__clc_as_uint(x)));
+}
+
+_CLC_OVERLOAD static __CLC_GENTYPE
+__clc_amdgpu_ds_swizzle_quad_perm_shift_right1(__CLC_GENTYPE x) {
+ return __CLC_AS_GENTYPE(
+ __clc_amdgpu_ds_swizzle_quad_perm_shift_right1(__clc_as_uint(x)));
+}
+
+#elif defined(__CLC_GENSIZE) && __CLC_GENSIZE < 32 || \
+ defined(__CLC_FPSIZE) && __CLC_FPSIZE < 32
+
+#define PROMOTE_FUNC_UINT(func_name) \
+ static _CLC_OVERLOAD __CLC_GENTYPE func_name(__CLC_GENTYPE x) { \
+ return __CLC_AS_GENTYPE(__CLC_CONVERT_U_GENTYPE( \
+ func_name(__clc_convert_uint(__CLC_AS_U_GENTYPE(x))))); \
+ }
+
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx2_lane0)
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx4_lane1)
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx8_lane3)
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx16_lane7)
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx32_lane15)
+
+PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_quad_perm_shift_right1)
+
+#elif defined(__CLC_GENSIZE) && __CLC_GENSIZE == 64 || \
+ defined(__CLC_FPSIZE) && __CLC_FPSIZE == 64
+
+#define SPLIT_FUNC_64(func_name) \
+ static _CLC_OVERLOAD __CLC_GENTYPE func_name(__CLC_GENTYPE x) { \
+ uint2 vec = __clc_as_uint2(x); \
+ uint2 r = {func_name(vec.lo), func_name(vec.hi)}; \
+ return __CLC_AS_GENTYPE(r); \
+ }
+
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_bcastx2_lane0)
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_bcastx4_lane1)
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_bcastx8_lane3)
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_bcastx16_lane7)
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_bcastx32_lane15)
+
+SPLIT_FUNC_64(__clc_amdgpu_ds_swizzle_quad_perm_shift_right1)
+
+#endif
+
+#endif // __CLC_SCALAR
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
new file mode 100644
index 0000000000000..108139f02e4a9
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
@@ -0,0 +1,133 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/clc_convert.h"
+#include "clc/math/clc_fmax.h"
+#include "clc/math/clc_fmin.h"
+#include "clc/shared/clc_max.h"
+#include "clc/shared/clc_min.h"
+#include "clc/subgroup/clc_sub_group_broadcast.h"
+#include "clc/subgroup/clc_sub_group_scan.h"
+#include "clc/subgroup/clc_subgroup.h"
+
+// Bit 15 of the ds_swizzle offset; set by SWIZZLE_QUAD_PERM below.
+#define QUAD_PERM (1 << 15)
+
+// The first basic swizzle mode (when offset[15] == 1) allows full data sharing
+// between a group of 4 consecutive threads. S0..S3 are packed into consecutive
+// 2-bit fields starting at bit 0. Every argument is parenthesized so that
+// non-literal arguments (e.g. `a | b`) cannot be mis-grouped by precedence.
+#define SWIZZLE_QUAD_PERM(S0, S1, S2, S3) \
+  ((uint)(QUAD_PERM | ((S3) << 6) | ((S2) << 4) | ((S1) << 2) | (S0)))
+
+// Packs the three masks into the offset at bits 10 (XOR_MASK), 5 (OR_MASK) and
+// 0 (AND_MASK). Bit 15 is left clear, so this does not select the quad-permute
+// mode above. Arguments are parenthesized for the same reason as above.
+#define SWIZZLE_PAIRWISE(XOR_MASK, OR_MASK, AND_MASK) \
+  ((uint)(((XOR_MASK) << 10) | ((OR_MASK) << 5) | (AND_MASK)))
+
+#define SWIZZLE_BCASTX2_LANE0 SWIZZLE_PAIRWISE(0x00, 0x00, 0x1e)
+#define SWIZZLE_BCASTX4_LANE1 SWIZZLE_PAIRWISE(0x00, 0x01, 0x1c)
+#define SWIZZLE_BCASTX8_LANE3 SWIZZLE_PAIRWISE(0x00, 0x03, 0x18)
+#define SWIZZLE_BCASTX16_LANE7 SWIZZLE_PAIRWISE(0x00, 0x07, 0x10)
+#define SWIZZLE_BCASTX32_LANE15 SWIZZLE_PAIRWISE(0x00, 0x0f, 0x00)
+
+//------------------------------------------------------------------------------
+// Swizzle masks used in inclusive scan
+//------------------------------------------------------------------------------
+
+static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx2_lane0(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX2_LANE0);
+}
+
+static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx4_lane1(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX4_LANE1);
+}
+
+static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx8_lane3(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX8_LANE3);
+}
+
+static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx16_lane7(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX16_LANE7);
+}
+
+static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx32_lane15(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX32_LANE15);
+}
+
+//------------------------------------------------------------------------------
+// Swizzle masks used in exclusive scan adjustment
+//------------------------------------------------------------------------------
+
+static _CLC_OVERLOAD uint
+__clc_amdgpu_ds_swizzle_quad_perm_shift_right1(uint x) {
+ return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_QUAD_PERM(0, 0, 1, 2));
+}
+
+#define __CLC_BODY "clc_amdgpu_ds_swizzle.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "clc_amdgpu_ds_swizzle.inc"
+#include "clc/math/gentype.inc"
+
+//------------------------------------------------------------------------------
+// Integer and fp add
+//------------------------------------------------------------------------------
+
+#define __CLC_FUNCTION_INCLUSIVE __clc_sub_group_scan_inclusive_add
+#define __CLC_FUNCTION_EXCLUSIVE __clc_sub_group_scan_exclusive_add
+#define __CLC_FUNCTION_IMPL(x, y) ((x) + (y))
+#define __CLC_SUBGROUP_SCAN_ID_VAL (__CLC_GENTYPE)0
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/math/gentype.inc"
+
+#undef __CLC_FUNCTION_INCLUSIVE
+#undef __CLC_FUNCTION_EXCLUSIVE
+#undef __CLC_FUNCTION_IMPL
+#undef __CLC_SUBGROUP_SCAN_ID_VAL
+
+//------------------------------------------------------------------------------
+// Integer and fp min
+//------------------------------------------------------------------------------
+
+#define __CLC_FUNCTION_INCLUSIVE __clc_sub_group_scan_inclusive_min
+#define __CLC_FUNCTION_EXCLUSIVE __clc_sub_group_scan_exclusive_min
+#define __CLC_FUNCTION_IMPL(x, y) __clc_min(x, y)
+#define __CLC_SUBGROUP_SCAN_ID_VAL __CLC_GEN_MAX
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/integer/gentype.inc"
+#undef __CLC_FUNCTION_IMPL
+
+#define __CLC_FUNCTION_IMPL(x, y) __clc_fmin(x, y)
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION_IMPL
+#undef __CLC_FUNCTION_INCLUSIVE
+#undef __CLC_FUNCTION_EXCLUSIVE
+#undef __CLC_SUBGROUP_SCAN_ID_VAL
+
+//------------------------------------------------------------------------------
+// Integer and fp max
+//------------------------------------------------------------------------------
+
+#define __CLC_FUNCTION_INCLUSIVE __clc_sub_group_scan_inclusive_max
+#define __CLC_FUNCTION_EXCLUSIVE __clc_sub_group_scan_exclusive_max
+#define __CLC_FUNCTION_IMPL(x, y) __clc_max(x, y)
+#define __CLC_SUBGROUP_SCAN_ID_VAL __CLC_GEN_MIN
+
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/integer/gentype.inc"
+#undef __CLC_FUNCTION_IMPL
+
+#define __CLC_FUNCTION_IMPL(x, y) __clc_fmax(x, y)
+#define __CLC_BODY "clc_sub_group_scan.inc"
+#include "clc/math/gentype.inc"
+#undef __CLC_FUNCTION_IMPL
+
+#undef __CLC_FUNCTION_INCLUSIVE
+#undef __CLC_FUNCTION_EXCLUSIVE
+#undef __CLC_FUNCTION_IMPL
+#undef __CLC_SUBGROUP_SCAN_ID_VAL
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.inc b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.inc
new file mode 100644
index 0000000000000..af0b3a30a0bfd
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.inc
@@ -0,0 +1,83 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#if defined(__CLC_SCALAR)
+
+// Largest and smallest values of __CLC_GENTYPE. The min and max scans select
+// these through __CLC_SUBGROUP_SCAN_ID_VAL as the value substituted for lanes
+// that must not contribute to a step.
+//
+// The integer limits are built by shifting an all-ones 64-bit value right, so
+// every shift count stays strictly below 64. Forming them as
+// `(1 << width) - 1` does not work for 64-bit types: OpenCL C reduces the
+// shift count modulo the operand width, so `1ull << 64` is 1 and the ulong
+// maximum would come out as 0; the signed variant overflows for long.
+#if defined(__CLC_GEN_S)
+#define __CLC_GEN_MAX ((__CLC_GENTYPE)(~0ull >> (65 - __CLC_GENSIZE)))
+#define __CLC_GEN_MIN ((__CLC_GENTYPE)(-__CLC_GEN_MAX - 1))
+#elif defined(__CLC_GEN_U)
+#define __CLC_GEN_MAX ((__CLC_GENTYPE)(~0ull >> (64 - __CLC_GENSIZE)))
+#define __CLC_GEN_MIN ((__CLC_GENTYPE)0)
+#elif defined(__CLC_FPSIZE)
+#define __CLC_GEN_MIN (-INFINITY)
+#define __CLC_GEN_MAX (INFINITY)
+#endif
+
+// Inclusive scan of `x` across the subgroup using __CLC_FUNCTION_IMPL as the
+// combining operation. The running value is combined with a value fetched via
+// ds_swizzle at five successive steps (helpers named bcastx2 .. bcastx32); at
+// each step only lanes whose corresponding local-id bit is set take the
+// fetched value, all others combine with __CLC_SUBGROUP_SCAN_ID_VAL. A sixth
+// step handles lanes above 31 when the wavefront size is 64.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __CLC_FUNCTION_INCLUSIVE(__CLC_GENTYPE x) {
+  const uint lane = __clc_get_sub_group_local_id();
+
+  // Step 1 (local-id bit 0): source is the bcastx2_lane0 swizzle of the input.
+  __CLC_GENTYPE incoming = __clc_amdgpu_ds_swizzle_bcastx2_lane0(x);
+  incoming = (lane & 1) ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+  __CLC_GENTYPE acc = __CLC_FUNCTION_IMPL(x, incoming);
+
+  // Step 2 (local-id bit 1): bcastx4_lane1 of the running value.
+  incoming = __clc_amdgpu_ds_swizzle_bcastx4_lane1(acc);
+  incoming = (lane & 2) ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+  acc = __CLC_FUNCTION_IMPL(acc, incoming);
+
+  // Step 3 (local-id bit 2): bcastx8_lane3 of the running value.
+  incoming = __clc_amdgpu_ds_swizzle_bcastx8_lane3(acc);
+  incoming = (lane & 4) ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+  acc = __CLC_FUNCTION_IMPL(acc, incoming);
+
+  // Step 4 (local-id bit 3): bcastx16_lane7 of the running value.
+  incoming = __clc_amdgpu_ds_swizzle_bcastx16_lane7(acc);
+  incoming = (lane & 8) ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+  acc = __CLC_FUNCTION_IMPL(acc, incoming);
+
+  // Step 5 (local-id bit 4): bcastx32_lane15 of the running value.
+  incoming = __clc_amdgpu_ds_swizzle_bcastx32_lane15(acc);
+  incoming = (lane & 16) ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+  acc = __CLC_FUNCTION_IMPL(acc, incoming);
+
+  // wave64 only: lanes above 31 additionally combine with lane 31's value.
+  if (__builtin_amdgcn_wavefrontsize() == 64) {
+    incoming = __clc_sub_group_broadcast(acc, 31);
+    incoming = lane > 31 ? incoming : __CLC_SUBGROUP_SCAN_ID_VAL;
+    acc = __CLC_FUNCTION_IMPL(acc, incoming);
+  }
+
+  return acc;
+}
+
+// Exclusive scan of `x`, derived from the inclusive scan. The inclusive result
+// is first passed through the quad_perm_shift_right1 swizzle; lanes at
+// sub-block boundaries (local id 4 mod 8, 8 mod 16, 16 mod 32, and 32 on
+// wave64) then replace that value with one fetched by a wider broadcast.
+// Lane 0 returns __CLC_SUBGROUP_SCAN_ID_VAL.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE __CLC_FUNCTION_EXCLUSIVE(__CLC_GENTYPE x) {
+  const __CLC_GENTYPE inclusive = __CLC_FUNCTION_INCLUSIVE(x);
+  const uint lane = __clc_get_sub_group_local_id();
+
+  // Starting point: the quad-permuted inclusive result.
+  __CLC_GENTYPE result =
+      __clc_amdgpu_ds_swizzle_quad_perm_shift_right1(inclusive);
+
+  // Local id == 4 (mod 8): take the bcastx8_lane3 value instead.
+  __CLC_GENTYPE donor = __clc_amdgpu_ds_swizzle_bcastx8_lane3(inclusive);
+  result = ((lane & 0x7) == 0x4) ? donor : result;
+
+  // Local id == 8 (mod 16): take the bcastx16_lane7 value instead.
+  donor = __clc_amdgpu_ds_swizzle_bcastx16_lane7(inclusive);
+  result = ((lane & 0xf) == 0x8) ? donor : result;
+
+  // Local id == 16 (mod 32): take the bcastx32_lane15 value instead.
+  donor = __clc_amdgpu_ds_swizzle_bcastx32_lane15(inclusive);
+  result = ((lane & 0x1f) == 0x10) ? donor : result;
+
+  // wave64 only: lane 32 takes lane 31's inclusive value.
+  if (__builtin_amdgcn_wavefrontsize() == 64) {
+    donor = __clc_sub_group_broadcast(inclusive, 31);
+    result = (lane == 32) ? donor : result;
+  }
+
+  return (lane == 0) ? __CLC_SUBGROUP_SCAN_ID_VAL : result;
+}
+
+#undef __CLC_GEN_MIN
+#undef __CLC_GEN_MAX
+
+#endif // __CLC_SCALAR
diff --git a/libclc/opencl/lib/generic/CMakeLists.txt b/libclc/opencl/lib/generic/CMakeLists.txt
index f30af80e9c65e..1b8beb57e34de 100644
--- a/libclc/opencl/lib/generic/CMakeLists.txt
+++ b/libclc/opencl/lib/generic/CMakeLists.txt
@@ -207,6 +207,8 @@ libclc_configure_source_list(OPENCL_GENERIC_SOURCES
shared/vstore.cl
subgroup/sub_group_broadcast.cl
subgroup/sub_group_reduce.cl
+ subgroup/sub_group_scan_exclusive.cl
+ subgroup/sub_group_scan_inclusive.cl
subgroup/subgroup.cl
synchronization/sub_group_barrier.cl
synchronization/work_group_barrier.cl
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.cl b/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.cl
new file mode 100644
index 0000000000000..8825a838f3392
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_scan.h"
+
+#define __CLC_BODY "sub_group_scan_exclusive.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "sub_group_scan_exclusive.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.inc b/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.inc
new file mode 100644
index 0000000000000..243637d8d824f
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_scan_exclusive.inc
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenCL-level exclusive scans; each forwards to the __clc_ function of the
+// same name. Emitted for scalar types only: always for floating-point types and
+// for 32- and 64-bit integers, and for the remaining integer widths only when
+// cl_khr_subgroup_extended_types is defined.
+#if defined(__CLC_SCALAR) && \
+    (defined(cl_khr_subgroup_extended_types) || defined(__CLC_FPSIZE) || \
+     __CLC_GENSIZE == 32 || __CLC_GENSIZE == 64)
+
+// Exclusive add scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_exclusive_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_exclusive_add(x);
+}
+
+// Exclusive min scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_exclusive_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_exclusive_min(x);
+}
+
+// Exclusive max scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_exclusive_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_exclusive_max(x);
+}
+
+#endif // scalar type with subgroup scan support
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.cl b/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.cl
new file mode 100644
index 0000000000000..15e8120443c61
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.cl
@@ -0,0 +1,15 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_scan.h"
+
+#define __CLC_BODY "sub_group_scan_inclusive.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "sub_group_scan_inclusive.inc"
+#include "clc/math/gentype.inc"
diff --git a/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.inc b/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.inc
new file mode 100644
index 0000000000000..7f0ad6d01b153
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_scan_inclusive.inc
@@ -0,0 +1,28 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+// OpenCL-level inclusive scans; each forwards to the __clc_ function of the
+// same name. Emitted for scalar types only: always for floating-point types and
+// for 32- and 64-bit integers, and for the remaining integer widths only when
+// cl_khr_subgroup_extended_types is defined.
+#if defined(__CLC_SCALAR) && \
+    (defined(cl_khr_subgroup_extended_types) || defined(__CLC_FPSIZE) || \
+     __CLC_GENSIZE == 32 || __CLC_GENSIZE == 64)
+
+// Inclusive add scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_inclusive_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_inclusive_add(x);
+}
+
+// Inclusive min scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_inclusive_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_inclusive_min(x);
+}
+
+// Inclusive max scan.
+_CLC_DEF _CLC_OVERLOAD __CLC_GENTYPE
+sub_group_scan_inclusive_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_scan_inclusive_max(x);
+}
+
+#endif // scalar type with subgroup scan support
>From 939f4f7495509a6cfd51894805ff157bbd3fbbeb Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Wed, 25 Mar 2026 22:28:15 +0100
Subject: [PATCH 2/3] Shrink ds_swizzle wrappers
---
.../amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc | 66 ++++++++-----------
.../lib/amdgpu/subgroup/clc_sub_group_scan.cl | 33 ----------
2 files changed, 28 insertions(+), 71 deletions(-)
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
index 633d8b35534f7..394b1025ace7b 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
@@ -8,44 +8,34 @@
#if defined(__CLC_SCALAR)
-#if (defined(__CLC_GEN_S) && __CLC_GENSIZE == 32) || \
- defined(__CLC_FPSIZE) && __CLC_FPSIZE == 32
-
-static _CLC_OVERLOAD __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_bcastx2_lane0(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_bcastx2_lane0(__clc_as_uint(x)));
-}
-
-static _CLC_OVERLOAD __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_bcastx4_lane1(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_bcastx4_lane1(__clc_as_uint(x)));
-}
-
-static _CLC_OVERLOAD __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_bcastx8_lane3(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_bcastx8_lane3(__clc_as_uint(x)));
-}
-
-static _CLC_OVERLOAD __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_bcastx16_lane7(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_bcastx16_lane7(__clc_as_uint(x)));
-}
-
-static _CLC_OVERLOAD __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_bcastx32_lane15(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_bcastx32_lane15(__clc_as_uint(x)));
-}
-
-_CLC_OVERLOAD static __CLC_GENTYPE
-__clc_amdgpu_ds_swizzle_quad_perm_shift_right1(__CLC_GENTYPE x) {
- return __CLC_AS_GENTYPE(
- __clc_amdgpu_ds_swizzle_quad_perm_shift_right1(__clc_as_uint(x)));
-}
+#if (defined(__CLC_GENSIZE) && __CLC_GENSIZE <= 32) || \
+ defined(__CLC_FPSIZE) && __CLC_FPSIZE <= 32
+
+// Defines a static __CLC_GENTYPE overload named `func` on top of
+// __builtin_amdgcn_ds_swizzle. The argument goes through __CLC_AS_U_GENTYPE and
+// __clc_convert_uint before the builtin is called with `mask`; the result goes
+// back through __CLC_CONVERT_U_GENTYPE and __CLC_AS_GENTYPE.
+#define COERCE_FUNC(func, mask) \
+  static _CLC_OVERLOAD __CLC_GENTYPE func(__CLC_GENTYPE x) { \
+    const __CLC_U_GENTYPE bits = __CLC_AS_U_GENTYPE(x); \
+    const uint widened = __clc_convert_uint(bits); \
+    const uint moved = __builtin_amdgcn_ds_swizzle(widened, mask); \
+    const __CLC_U_GENTYPE narrowed = __CLC_CONVERT_U_GENTYPE(moved); \
+    return __CLC_AS_GENTYPE(narrowed); \
+  }
+
+//------------------------------------------------------------------------------
+// Swizzle masks used in inclusive scan
+//------------------------------------------------------------------------------
+
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx2_lane0, SWIZZLE_BCASTX2_LANE0);
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx4_lane1, SWIZZLE_BCASTX4_LANE1);
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx8_lane3, SWIZZLE_BCASTX8_LANE3);
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx16_lane7, SWIZZLE_BCASTX16_LANE7);
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx32_lane15, SWIZZLE_BCASTX32_LANE15);
+
+//------------------------------------------------------------------------------
+// Swizzle masks used in exclusive scan adjustment
+//------------------------------------------------------------------------------
+
+COERCE_FUNC(__clc_amdgpu_ds_swizzle_quad_perm_shift_right1,
+ SWIZZLE_QUAD_PERM(0, 0, 1, 2));
#elif defined(__CLC_GENSIZE) && __CLC_GENSIZE < 32 || \
defined(__CLC_FPSIZE) && __CLC_FPSIZE < 32
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
index 108139f02e4a9..ef5209eed8374 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
@@ -31,39 +31,6 @@
#define SWIZZLE_BCASTX16_LANE7 SWIZZLE_PAIRWISE(0x00, 0x07, 0x10)
#define SWIZZLE_BCASTX32_LANE15 SWIZZLE_PAIRWISE(0x00, 0x0f, 0x00)
-//------------------------------------------------------------------------------
-// Swizzle masks used in inclusive scan
-//------------------------------------------------------------------------------
-
-static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx2_lane0(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX2_LANE0);
-}
-
-static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx4_lane1(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX4_LANE1);
-}
-
-static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx8_lane3(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX8_LANE3);
-}
-
-static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx16_lane7(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX16_LANE7);
-}
-
-static _CLC_OVERLOAD uint __clc_amdgpu_ds_swizzle_bcastx32_lane15(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_BCASTX32_LANE15);
-}
-
-//------------------------------------------------------------------------------
-// Swizzle masks used in exclusive scan adjustment
-//------------------------------------------------------------------------------
-
-static _CLC_OVERLOAD uint
-__clc_amdgpu_ds_swizzle_quad_perm_shift_right1(uint x) {
- return __builtin_amdgcn_ds_swizzle(x, SWIZZLE_QUAD_PERM(0, 0, 1, 2));
-}
-
#define __CLC_BODY "clc_amdgpu_ds_swizzle.inc"
#include "clc/integer/gentype.inc"
>From 2d16055dcfe2425c7fce3bafb195ac4cc4d7cec5 Mon Sep 17 00:00:00 2001
From: Matt Arsenault <Matthew.Arsenault at amd.com>
Date: Fri, 27 Mar 2026 09:17:35 +0100
Subject: [PATCH 3/3] Address comments
---
.../amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc | 17 -----------------
.../lib/amdgpu/subgroup/clc_sub_group_scan.cl | 6 ------
2 files changed, 23 deletions(-)
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
index 394b1025ace7b..2455b0698aae1 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_amdgpu_ds_swizzle.inc
@@ -37,23 +37,6 @@ COERCE_FUNC(__clc_amdgpu_ds_swizzle_bcastx32_lane15, SWIZZLE_BCASTX32_LANE15);
COERCE_FUNC(__clc_amdgpu_ds_swizzle_quad_perm_shift_right1,
SWIZZLE_QUAD_PERM(0, 0, 1, 2));
-#elif defined(__CLC_GENSIZE) && __CLC_GENSIZE < 32 || \
- defined(__CLC_FPSIZE) && __CLC_FPSIZE < 32
-
-#define PROMOTE_FUNC_UINT(func_name) \
- static _CLC_OVERLOAD __CLC_GENTYPE func_name(__CLC_GENTYPE x) { \
- return __CLC_AS_GENTYPE(__CLC_CONVERT_U_GENTYPE( \
- func_name(__clc_convert_uint(__CLC_AS_U_GENTYPE(x))))); \
- }
-
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx2_lane0)
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx4_lane1)
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx8_lane3)
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx16_lane7)
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_bcastx32_lane15)
-
-PROMOTE_FUNC_UINT(__clc_amdgpu_ds_swizzle_quad_perm_shift_right1)
-
#elif defined(__CLC_GENSIZE) && __CLC_GENSIZE == 64 || \
defined(__CLC_FPSIZE) && __CLC_FPSIZE == 64
diff --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
index ef5209eed8374..3ef735aac2aae 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_scan.cl
@@ -66,9 +66,7 @@
#define __CLC_SUBGROUP_SCAN_ID_VAL __CLC_GEN_MAX
#define __CLC_BODY "clc_sub_group_scan.inc"
#include "clc/integer/gentype.inc"
-#undef __CLC_FUNCTION_IMPL
-#define __CLC_FUNCTION_IMPL(x, y) __clc_fmin(x, y)
#define __CLC_BODY "clc_sub_group_scan.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION_IMPL
@@ -87,14 +85,10 @@
#define __CLC_BODY "clc_sub_group_scan.inc"
#include "clc/integer/gentype.inc"
-#undef __CLC_FUNCTION_IMPL
-#define __CLC_FUNCTION_IMPL(x, y) __clc_fmax(x, y)
#define __CLC_BODY "clc_sub_group_scan.inc"
#include "clc/math/gentype.inc"
#undef __CLC_FUNCTION_IMPL
-
#undef __CLC_FUNCTION_INCLUSIVE
#undef __CLC_FUNCTION_EXCLUSIVE
-#undef __CLC_FUNCTION_IMPL
#undef __CLC_SUBGROUP_SCAN_ID_VAL
More information about the cfe-commits
mailing list