[libclc] 35781a7 - libclc: Partially implement nonuniform subgroup reduce functions (#188929)

via cfe-commits cfe-commits at lists.llvm.org
Fri Mar 27 02:47:49 PDT 2026


Author: Matt Arsenault
Date: 2026-03-27T09:47:44Z
New Revision: 35781a7d43fbd61cf18fd63923df95cb1d2a861c

URL: https://github.com/llvm/llvm-project/commit/35781a7d43fbd61cf18fd63923df95cb1d2a861c
DIFF: https://github.com/llvm/llvm-project/commit/35781a7d43fbd61cf18fd63923df95cb1d2a861c.diff

LOG: libclc: Partially implement nonuniform subgroup reduce functions (#188929)

For AMDGPU these are identical to the uniform case. Stub out the missing
cases with traps to avoid test failures from undefined symbols while
keeping the structure consistent.

Added: 
    libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
    libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc
    libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
    libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
    libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
    libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc

Modified: 
    libclc/clc/lib/amdgpu/CMakeLists.txt
    libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
    libclc/opencl/lib/generic/CMakeLists.txt

Removed: 
    


################################################################################
diff  --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
new file mode 100644
index 0000000000000..bb8ff12030809
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce.h
@@ -0,0 +1,29 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__
+#define __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__
+
+#include "clc/internal/clc.h"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc"
+#include "clc/math/gentype.inc"
+
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_and(int x);
+
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_or(int x);
+
+_CLC_DECL _CLC_OVERLOAD int
+__clc_sub_group_non_uniform_reduce_logical_xor(int x);
+
+#endif // __CLC_SUBGROUP_CLC_SUB_GROUP_NON_UNIFORM_REDUCE_H__

diff  --git a/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc
new file mode 100644
index 0000000000000..f7b82d3119ed9
--- /dev/null
+++ b/libclc/clc/include/clc/subgroup/clc_sub_group_non_uniform_reduce_decl.inc
@@ -0,0 +1,33 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_add(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_mul(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_min(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_max(__CLC_GENTYPE x);
+
+#ifndef __CLC_FPSIZE
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_and(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_or(__CLC_GENTYPE x);
+
+_CLC_DECL _CLC_OVERLOAD __CLC_GENTYPE
+__clc_sub_group_non_uniform_reduce_xor(__CLC_GENTYPE x);
+#endif // __CLC_FPSIZE
+
+#endif // __CLC_SCALAR

diff  --git a/libclc/clc/lib/amdgpu/CMakeLists.txt b/libclc/clc/lib/amdgpu/CMakeLists.txt
index 186b6ab0b85b1..a5cd47fab4462 100644
--- a/libclc/clc/lib/amdgpu/CMakeLists.txt
+++ b/libclc/clc/lib/amdgpu/CMakeLists.txt
@@ -29,6 +29,7 @@ libclc_configure_source_list(CLC_AMDGPU_SOURCES
   mem_fence/clc_mem_fence.cl
   subgroup/clc_subgroup.cl
   subgroup/clc_sub_group_broadcast.cl
+  subgroup/clc_sub_group_non_uniform_reduce.cl
   subgroup/clc_sub_group_reduce.cl
   subgroup/clc_sub_group_scan.cl
   synchronization/clc_sub_group_barrier.cl

diff  --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
new file mode 100644
index 0000000000000..00a485e9405a3
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_non_uniform_reduce.cl
@@ -0,0 +1,384 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_add(uint x) {
+  return __builtin_amdgcn_wave_reduce_add_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_add(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_add(ulong x) {
+  return __builtin_amdgcn_wave_reduce_add_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_add(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_add((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_min(uint x) {
+  return __builtin_amdgcn_wave_reduce_min_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_min(int x) {
+  return __builtin_amdgcn_wave_reduce_min_i32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_min(ulong x) {
+  return __builtin_amdgcn_wave_reduce_min_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_min(long x) {
+  return __builtin_amdgcn_wave_reduce_min_i64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_max(uint x) {
+  return __builtin_amdgcn_wave_reduce_max_u32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_max(int x) {
+  return __builtin_amdgcn_wave_reduce_max_i32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_max(ulong x) {
+  return __builtin_amdgcn_wave_reduce_max_u64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_max(long x) {
+  return __builtin_amdgcn_wave_reduce_max_i64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_add(float x) {
+  return __builtin_amdgcn_wave_reduce_fadd_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_add(double x) {
+  return __builtin_amdgcn_wave_reduce_fadd_f64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_min(float x) {
+  return __builtin_amdgcn_wave_reduce_fmin_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_min(double x) {
+  return __builtin_amdgcn_wave_reduce_fmin_f64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_max(float x) {
+  return __builtin_amdgcn_wave_reduce_fmax_f32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_max(double x) {
+  return __builtin_amdgcn_wave_reduce_fmax_f64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_add(half x) {
+  // FIXME: Should use a direct half builtin; reduces via float for now.
+  return (half)__clc_sub_group_non_uniform_reduce_add((float)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_min(half x) {
+  // FIXME: Should use a direct half builtin; reduces via float for now.
+  return (half)__clc_sub_group_non_uniform_reduce_min((float)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_max(half x) {
+  // FIXME: Should use a direct half builtin; reduces via float for now.
+  return (half)__clc_sub_group_non_uniform_reduce_max((float)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_add(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_add(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_add((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_add(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_add((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_add(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_add((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_min(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_min((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_min(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_min((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_min(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_min((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_min(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_min((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_max(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_max((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_max(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_max((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_max(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_max((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_max(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_max((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_and(uint x) {
+  return __builtin_amdgcn_wave_reduce_and_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_and(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_and(ulong x) {
+  return __builtin_amdgcn_wave_reduce_and_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_and(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_and((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_or(uint x) {
+  return __builtin_amdgcn_wave_reduce_or_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_or(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_or(ulong x) {
+  return __builtin_amdgcn_wave_reduce_or_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_or(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_or((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_xor(uint x) {
+  return __builtin_amdgcn_wave_reduce_xor_b32(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_xor(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_xor(ulong x) {
+  return __builtin_amdgcn_wave_reduce_xor_b64(x, 0);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_xor(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_xor((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_and(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_and(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_and((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_and(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_and((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_and(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_and((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_or(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_or(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_or((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_or(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_or((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_or(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_or((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_xor(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_xor(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_xor((int)x);
+}
+
+// FIXME: There should be a direct short builtin available.
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_xor(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_xor((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_xor(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_xor((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint
+__clc_sub_group_non_uniform_reduce_mul(uint x) {
+  (void)x;
+  // TODO: Not implemented; trap so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_mul(int x) {
+  return (int)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong
+__clc_sub_group_non_uniform_reduce_mul(ulong x) {
+  (void)x;
+  // TODO: Not implemented; trap so the symbol is at least defined.
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST long
+__clc_sub_group_non_uniform_reduce_mul(long x) {
+  return (long)__clc_sub_group_non_uniform_reduce_mul((ulong)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST char
+__clc_sub_group_non_uniform_reduce_mul(char x) {
+  return (char)__clc_sub_group_non_uniform_reduce_mul((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar
+__clc_sub_group_non_uniform_reduce_mul(uchar x) {
+  return (uchar)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST short
+__clc_sub_group_non_uniform_reduce_mul(short x) {
+  return (short)__clc_sub_group_non_uniform_reduce_mul((int)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort
+__clc_sub_group_non_uniform_reduce_mul(ushort x) {
+  return (ushort)__clc_sub_group_non_uniform_reduce_mul((uint)x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_and(int predicate) {
+  // TODO: Not implemented; trap so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_or(int predicate) {
+  // TODO: Not implemented; trap so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST int
+__clc_sub_group_non_uniform_reduce_logical_xor(int predicate) {
+  // TODO: Not implemented; trap so the symbol is at least defined.
+  (void)predicate;
+  __builtin_trap();
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST float
+__clc_sub_group_non_uniform_reduce_mul(float x) {
+  (void)x;
+  __builtin_trap(); // TODO: Not implemented; stub keeps the symbol defined.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST double
+__clc_sub_group_non_uniform_reduce_mul(double x) {
+  (void)x;
+  __builtin_trap(); // TODO: Not implemented; stub keeps the symbol defined.
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST half
+__clc_sub_group_non_uniform_reduce_mul(half x) {
+  (void)x;
+  __builtin_trap(); // TODO: Not implemented; stub keeps the symbol defined.
+}

diff  --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
index 66d0130839d38..c74fe96459800 100644
--- a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.cl
@@ -6,140 +6,14 @@
 //
 //===----------------------------------------------------------------------===//
 
-#include "clc/subgroup/clc_sub_group_broadcast.h"
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+#include "clc/subgroup/clc_sub_group_reduce.h"
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_add(uint x) {
-  return __builtin_amdgcn_wave_reduce_add_u32(x, 0);
-}
+// The implementation is the same as the nonuniform case, so just call the
+// nonuniform versions of every function.
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_add(int x) {
-  return (int)__clc_sub_group_reduce_add((uint)x);
-}
+#define __CLC_BODY "clc_sub_group_reduce.inc"
+#include "clc/integer/gentype.inc"
 
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_add(ulong x) {
-  return __builtin_amdgcn_wave_reduce_add_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_add(long x) {
-  return (long)__clc_sub_group_reduce_add((ulong)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_min(uint x) {
-  return __builtin_amdgcn_wave_reduce_min_u32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_min(int x) {
-  return __builtin_amdgcn_wave_reduce_min_i32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_min(ulong x) {
-  return __builtin_amdgcn_wave_reduce_min_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_min(long x) {
-  return __builtin_amdgcn_wave_reduce_min_i64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uint __clc_sub_group_reduce_max(uint x) {
-  return __builtin_amdgcn_wave_reduce_max_u32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST int __clc_sub_group_reduce_max(int x) {
-  return __builtin_amdgcn_wave_reduce_max_i32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ulong __clc_sub_group_reduce_max(ulong x) {
-  return __builtin_amdgcn_wave_reduce_max_u64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST long __clc_sub_group_reduce_max(long x) {
-  return __builtin_amdgcn_wave_reduce_max_i64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_add(float x) {
-  return __builtin_amdgcn_wave_reduce_fadd_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_add(double x) {
-  return __builtin_amdgcn_wave_reduce_fadd_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_min(float x) {
-  return __builtin_amdgcn_wave_reduce_fmin_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_min(double x) {
-  return __builtin_amdgcn_wave_reduce_fmin_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST float __clc_sub_group_reduce_max(float x) {
-  return __builtin_amdgcn_wave_reduce_fmax_f32(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST double __clc_sub_group_reduce_max(double x) {
-  return __builtin_amdgcn_wave_reduce_fmax_f64(x, 0);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_add(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_add((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_min(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_min((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST half __clc_sub_group_reduce_max(half x) {
-  // FIXME: There should be a direct half builtin available.
-  return (float)__clc_sub_group_reduce_max((float)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_add(uchar x) {
-  return (uchar)__clc_sub_group_reduce_add((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_add(char x) {
-  return (char)__clc_sub_group_reduce_add((int)x);
-}
-
-// FIXME: There should be a direct short builtin available.
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_add(ushort x) {
-  return (ushort)__clc_sub_group_reduce_add((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_add(short x) {
-  return (int)__clc_sub_group_reduce_add((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_min(uchar x) {
-  return (uchar)__clc_sub_group_reduce_min((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_min(char x) {
-  return (char)__clc_sub_group_reduce_min((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_min(ushort x) {
-  return (ushort)__clc_sub_group_reduce_min((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_min(short x) {
-  return (int)__clc_sub_group_reduce_min((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST uchar __clc_sub_group_reduce_max(uchar x) {
-  return (uchar)__clc_sub_group_reduce_max((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST char __clc_sub_group_reduce_max(char x) {
-  return (char)__clc_sub_group_reduce_max((int)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST ushort __clc_sub_group_reduce_max(ushort x) {
-  return (ushort)__clc_sub_group_reduce_max((uint)x);
-}
-
-_CLC_DEF _CLC_OVERLOAD _CLC_CONST short __clc_sub_group_reduce_max(short x) {
-  return (int)__clc_sub_group_reduce_max((int)x);
-}
+#define __CLC_BODY "clc_sub_group_reduce.inc"
+#include "clc/math/gentype.inc"

diff  --git a/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
new file mode 100644
index 0000000000000..1d487fdd7e3e9
--- /dev/null
+++ b/libclc/clc/lib/amdgpu/subgroup/clc_sub_group_reduce.inc
@@ -0,0 +1,26 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_add(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_min(x);
+}
+
+_CLC_DEF _CLC_OVERLOAD _CLC_CONST __CLC_GENTYPE
+__clc_sub_group_reduce_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_max(x);
+}
+
+#endif // __CLC_SCALAR

diff  --git a/libclc/opencl/lib/generic/CMakeLists.txt b/libclc/opencl/lib/generic/CMakeLists.txt
index e6565c7f9ed67..4ad60248139ae 100644
--- a/libclc/opencl/lib/generic/CMakeLists.txt
+++ b/libclc/opencl/lib/generic/CMakeLists.txt
@@ -208,6 +208,7 @@ libclc_configure_source_list(OPENCL_GENERIC_SOURCES
   shared/vstore.cl
   subgroup/sub_group_broadcast.cl
   subgroup/sub_group_reduce.cl
+  subgroup/sub_group_non_uniform_reduce.cl
   subgroup/sub_group_scan_exclusive.cl
   subgroup/sub_group_scan_inclusive.cl
   subgroup/subgroup.cl

diff  --git a/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
new file mode 100644
index 0000000000000..e00717b979ea1
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.cl
@@ -0,0 +1,30 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clc/subgroup/clc_sub_group_non_uniform_reduce.h"
+
+#define __CLC_BODY "sub_group_non_uniform_reduce.inc"
+#include "clc/integer/gentype.inc"
+
+#define __CLC_BODY "sub_group_non_uniform_reduce.inc"
+#include "clc/math/gentype.inc"
+
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_and(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_and(predicate);
+}
+
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_or(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_or(predicate);
+}
+
+_CLC_DEF _CLC_OVERLOAD int
+sub_group_non_uniform_reduce_logical_xor(int predicate) {
+  return __clc_sub_group_non_uniform_reduce_logical_xor(predicate);
+}

diff  --git a/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc
new file mode 100644
index 0000000000000..4e5c8dcd8523f
--- /dev/null
+++ b/libclc/opencl/lib/generic/subgroup/sub_group_non_uniform_reduce.inc
@@ -0,0 +1,48 @@
+//===----------------------------------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __CLC_SCALAR
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_add(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_add(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_mul(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_mul(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_min(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_min(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_max(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_max(x);
+}
+
+#ifndef __CLC_FPSIZE
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_and(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_and(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_or(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_or(x);
+}
+
+_CLC_OVERLOAD _CLC_DEF _CLC_CONST __CLC_GENTYPE
+sub_group_non_uniform_reduce_xor(__CLC_GENTYPE x) {
+  return __clc_sub_group_non_uniform_reduce_xor(x);
+}
+#endif // __CLC_FPSIZE
+
+#endif // __CLC_SCALAR


        


More information about the cfe-commits mailing list