[clang] 4604762 - [AMDGPU] Add builtins for wave reduction intrinsics (#161816)
via cfe-commits
cfe-commits at lists.llvm.org
Mon Nov 24 01:43:15 PST 2025
Author: Aaditya
Date: 2025-11-24T15:13:11+05:30
New Revision: 4604762cc336317b0f02f7d8c1576f6205f4ea61
URL: https://github.com/llvm/llvm-project/commit/4604762cc336317b0f02f7d8c1576f6205f4ea61
DIFF: https://github.com/llvm/llvm-project/commit/4604762cc336317b0f02f7d8c1576f6205f4ea61.diff
LOG: [AMDGPU] Add builtins for wave reduction intrinsics (#161816)
Added:
Modified:
clang/include/clang/Basic/BuiltinsAMDGPU.def
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
clang/test/CodeGenOpenCL/builtins-amdgcn.cl
Removed:
################################################################################
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 81e684a04a03d..a3ded0f6a9983 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -402,6 +402,10 @@ BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fadd_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fsub_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fmin_f32, "ffZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_fmax_f32, "ffZi", "nc")
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 92ee69218620b..81b3fe9e79483 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -390,18 +390,26 @@ static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u64:
return Intrinsic::amdgcn_wave_reduce_add;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
+ return Intrinsic::amdgcn_wave_reduce_fadd;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u64:
return Intrinsic::amdgcn_wave_reduce_sub;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
+ return Intrinsic::amdgcn_wave_reduce_fsub;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
return Intrinsic::amdgcn_wave_reduce_min;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
+ return Intrinsic::amdgcn_wave_reduce_fmin;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
return Intrinsic::amdgcn_wave_reduce_umin;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
return Intrinsic::amdgcn_wave_reduce_max;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
+ return Intrinsic::amdgcn_wave_reduce_fmax;
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
return Intrinsic::amdgcn_wave_reduce_umax;
@@ -423,11 +431,15 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_fadd_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_fsub_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmin_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_fmax_f32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index b92454de60c78..a5132c9114673 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -405,6 +405,13 @@ void test_wave_reduce_add_u64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 0);
+}
+
// CHECK-LABEL: @test_wave_reduce_add_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_iterative(global int* out, int in)
@@ -419,6 +426,13 @@ void test_wave_reduce_add_u64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 1); // strategy 1, as in the integer *_iterative tests
+}
+
// CHECK-LABEL: @test_wave_reduce_add_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
void test_wave_reduce_add_u32_dpp(global int* out, int in)
@@ -433,6 +447,13 @@ void test_wave_reduce_add_u64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_add_u64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_fadd_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fadd.f32(
+void test_wave_reduce_fadd_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fadd_f32(in, 2); // strategy 2, as in the integer *_dpp tests
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_default(global int* out, int in)
@@ -447,6 +468,13 @@ void test_wave_reduce_sub_u64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 0);
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_iterative(global int* out, int in)
@@ -461,6 +489,13 @@ void test_wave_reduce_sub_u64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 1); // strategy 1, as in the integer *_iterative tests
+}
+
// CHECK-LABEL: @test_wave_reduce_sub_u32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
void test_wave_reduce_sub_u32_dpp(global int* out, int in)
@@ -475,6 +510,13 @@ void test_wave_reduce_sub_u64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_sub_u64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_fsub_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fsub.f32(
+void test_wave_reduce_fsub_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fsub_f32(in, 2); // strategy 2, as in the integer *_dpp tests
+}
+
// CHECK-LABEL: @test_wave_reduce_and_b32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
void test_wave_reduce_and_b32_default(global int* out, int in)
@@ -615,6 +657,13 @@ void test_wave_reduce_min_i64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 0);
+}
+
// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_iterative(global int* out, int in)
@@ -629,6 +678,13 @@ void test_wave_reduce_min_i64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 1); // strategy 1, as in the integer *_iterative tests
+}
+
// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
void test_wave_reduce_min_i32_dpp(global int* out, int in)
@@ -643,6 +699,13 @@ void test_wave_reduce_min_i64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_fmin_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmin.f32(
+void test_wave_reduce_fmin_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmin_f32(in, 2); // strategy 2, as in the integer *_dpp tests
+}
+
// CHECK-LABEL: @test_wave_reduce_min_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
void test_wave_reduce_min_u32_default(global int* out, int in)
@@ -699,6 +762,13 @@ void test_wave_reduce_max_i64_default(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
}
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_default
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_default(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 0);
+}
+
// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_iterative(global int* out, int in)
@@ -713,6 +783,13 @@ void test_wave_reduce_max_i64_iterative(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
}
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_iterative
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_iterative(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 1); // strategy 1, as in the integer *_iterative tests
+}
+
// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
void test_wave_reduce_max_i32_dpp(global int* out, int in)
@@ -727,6 +804,13 @@ void test_wave_reduce_max_i64_dpp(global int* out, long in)
*out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
}
+// CHECK-LABEL: @test_wave_reduce_fmax_f32_dpp
+// CHECK: {{.*}}call{{.*}} float @llvm.amdgcn.wave.reduce.fmax.f32(
+void test_wave_reduce_fmax_f32_dpp(global float* out, float in)
+{
+ *out = __builtin_amdgcn_wave_reduce_fmax_f32(in, 2); // strategy 2, as in the integer *_dpp tests
+}
+
// CHECK-LABEL: @test_wave_reduce_max_u32_default
// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
void test_wave_reduce_max_u32_default(global int* out, int in)
More information about the cfe-commits
mailing list