https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/161816
>From 62867d1bcdb3d8d0eba2b04a78f61f98b92e7de6 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Tue, 30 Sep 2025 11:37:42 +0530
Subject: [PATCH] [AMDGPU] Add builtins for wave reduction intrinsics
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 4 +
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 8 ++
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 84 ++++++++++++++++++++
3 files changed, 96 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fda16e42d2c6b..ebc0ac35f42d9 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_f32, "ffZi", "nc")
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 07cf08c54985a..4de722077c8e9 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -301,18 +301,22 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
llvm_unreachable("Unknown BuiltinID for wave reduction");
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
return Intrinsic::amdgcn_wave_reduce_add;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
return Intrinsic::amdgcn_wave_reduce_sub;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
return Intrinsic::amdgcn_wave_reduce_min;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
return Intrinsic::amdgcn_wave_reduce_umin;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
return Intrinsic::amdgcn_wave_reduce_max;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
@@ -335,11 +339,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index 039d03237b530..a8856ab56a55d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -412,6 +412,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_add_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_f32(in, 0); // strategy operand 0: the "default" variant, per the test name
+}
+
// CHECK-LABEL: @test_wave_reduce_add_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -426,6 +433,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_add_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_f32(in, 1); // strategy operand 1, as in the u64 iterative test
+}
+
// CHECK-LABEL: @test_wave_reduce_add_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_dpp(global int* out, int in)
@@ -440,6 +454,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_add_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.add.f32(
+void test_wave_reduce_add_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_f32(in, 2); // strategy operand 2, as in the u64 dpp test
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_default(global int* out, int in)
@@ -454,6 +475,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_sub_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 0); // strategy operand 0: the "default" variant, per the test name
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_iterative(global int* out, int in)
@@ -468,6 +496,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_sub_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 1); // strategy operand 1, as in the u64 iterative test
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_dpp(global int* out, int in)
@@ -482,6 +517,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_sub_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.sub.f32(
+void test_wave_reduce_sub_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_f32(in, 2); // strategy operand 2, as in the u64 dpp test
+}
+
// CHECK-LABEL: @test_wave_reduce_and_b32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_b32_default(global int* out, int in)
@@ -622,6 +664,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_min_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_f32(in, 0); // strategy operand 0: the "default" variant, per the test name
+}
+
// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_iterative(global int* out, int in)
@@ -636,6 +685,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_min_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_f32(in, 1); // strategy operand 1, as in the i64 iterative test
+}
+
// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_dpp(global int* out, int in)
@@ -650,6 +706,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_min_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.min.f32(
+void test_wave_reduce_min_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_f32(in, 2); // strategy operand 2, as in the i64 dpp test
+}
+
// CHECK-LABEL: @test_wave_reduce_min_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_min_u32_default(global int* out, int in)
@@ -706,6 +769,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_max_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_f32(in, 0); // strategy operand 0: the "default" variant, per the test name
+}
+
// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_iterative(global int* out, int in)
@@ -720,6 +790,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_max_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_f32(in, 1); // strategy operand 1, as in the i64 iterative test
+}
+
// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_dpp(global int* out, int in)
@@ -734,6 +811,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_max_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.max.f32(
+void test_wave_reduce_max_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_f32(in, 2); // strategy operand 2, as in the i64 dpp test
+}
+
// CHECK-LABEL: @test_wave_reduce_max_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_max_u32_default(global int* out, int in)