[libclc] Refine generic __clc_get_sub_group_size with fast full sub-group path (PR #188895)
Wenju He via cfe-commits
cfe-commits at lists.llvm.org
Thu Mar 26 19:15:59 PDT 2026
https://github.com/wenju-he created https://github.com/llvm/llvm-project/pull/188895
Add a fast path for the common case where the total work-group size is a multiple of the max sub-group size.
The fallback path is ported from amdgpu/workitem/clc_get_sub_group_size.cl.
The compiler can generate predicated instructions for the fallback path to avoid branches.
>From 7fc520b6e762e13854722f1aa45188098ecc2587 Mon Sep 17 00:00:00 2001
From: Wenju He <wenju.he at intel.com>
Date: Fri, 27 Mar 2026 03:00:59 +0100
Subject: [PATCH] [libclc] Refine generic __clc_get_sub_group_size with fast
full sub-group path
Add a fast path for the common case where the total work-group size is
a multiple of the max sub-group size.
The fallback path is ported from amdgpu/workitem/clc_get_sub_group_size.cl.
The compiler can generate predicated instructions for the fallback path
to avoid branches.
---
.../workitem/clc_get_sub_group_size.cl | 24 +++++++++----------
1 file changed, 12 insertions(+), 12 deletions(-)
diff --git a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
index 7944486aac0f0..7f96fc8c31717 100644
--- a/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
+++ b/libclc/clc/lib/generic/workitem/clc_get_sub_group_size.cl
@@ -6,21 +6,21 @@
//
//===----------------------------------------------------------------------===//
+#include "clc/shared/clc_min.h"
+#include "clc/workitem/clc_get_local_linear_id.h"
#include "clc/workitem/clc_get_local_size.h"
#include "clc/workitem/clc_get_max_sub_group_size.h"
-#include "clc/workitem/clc_get_num_sub_groups.h"
-#include "clc/workitem/clc_get_sub_group_id.h"
#include "clc/workitem/clc_get_sub_group_size.h"
_CLC_OVERLOAD _CLC_DEF uint __clc_get_sub_group_size() {
- if (__clc_get_sub_group_id() != __clc_get_num_sub_groups() - 1) {
- return __clc_get_max_sub_group_size();
- }
- size_t size_x = __clc_get_local_size(0);
- size_t size_y = __clc_get_local_size(1);
- size_t size_z = __clc_get_local_size(2);
- size_t linear_size = size_z * size_y * size_x;
- size_t uniform_groups = __clc_get_num_sub_groups() - 1;
- size_t uniform_size = __clc_get_max_sub_group_size() * uniform_groups;
- return linear_size - uniform_size;
+ uint local_linear_size = (uint)__clc_get_local_size(0) *
+ (uint)__clc_get_local_size(1) *
+ (uint)__clc_get_local_size(2);
+ uint max_sg_size = __clc_get_max_sub_group_size();
+ // Assume max_sg_size is a power of 2, so (max_sg_size - 1) is a mask of its low bits.
+ uint remainder = local_linear_size & (max_sg_size - 1);
+ if (remainder == 0)
+ return max_sg_size;
+ uint lid = (uint)__clc_get_local_linear_id();
+ return __clc_min(max_sg_size, local_linear_size - (lid & ~(max_sg_size - 1)));
}
More information about the cfe-commits
mailing list