[llvm-branch-commits] [clang] [llvm] [AMDGPU] Add builtins for wave reduction intrinsics (PR #150170)
via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Jul 28 02:07:54 PDT 2025
https://github.com/easyonaadit updated https://github.com/llvm/llvm-project/pull/150170
>From 01432a07787d4067b4625c2df8882b04faa073c7 Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Sat, 19 Jul 2025 12:57:27 +0530
Subject: [PATCH 1/2] Add builtins for wave reduction intrinsics
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 25 ++
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 58 +++
clang/test/CodeGenOpenCL/builtins-amdgcn.cl | 378 +++++++++++++++++++
3 files changed, 461 insertions(+)
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 878543566f0e3..c8b324193e9fb 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -351,6 +351,31 @@ BUILTIN(__builtin_amdgcn_endpgm, "v", "nr")
BUILTIN(__builtin_amdgcn_get_fpenv, "WUi", "n")
BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
+//===----------------------------------------------------------------------===//
+
+// Wave Reduction builtins.
+
+//===----------------------------------------------------------------------===//
+
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWii", "nc")
+
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 7dccf82b1a7a3..28ea918b97cc5 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -295,11 +295,69 @@ void CodeGenFunction::AddAMDGPUFenceAddressSpaceMMRA(llvm::Instruction *Inst,
Inst->setMetadata(LLVMContext::MD_mmra, MMRAMetadata::getMD(Ctx, MMRAs));
}
+static Intrinsic::ID getIntrinsicIDforWaveReduction(unsigned BuiltinID) {
+ switch (BuiltinID) {
+ default:
+ llvm_unreachable("Unknown BuiltinID for wave reduction");
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i64:
+ return Intrinsic::amdgcn_wave_reduce_add;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i64:
+ return Intrinsic::amdgcn_wave_reduce_sub;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+ return Intrinsic::amdgcn_wave_reduce_min;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
+ return Intrinsic::amdgcn_wave_reduce_umin;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+ return Intrinsic::amdgcn_wave_reduce_max;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
+ return Intrinsic::amdgcn_wave_reduce_umax;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
+ return Intrinsic::amdgcn_wave_reduce_and;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
+ return Intrinsic::amdgcn_wave_reduce_or;
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
+ case clang::AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64:
+ return Intrinsic::amdgcn_wave_reduce_xor;
+ }
+}
+
Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
const CallExpr *E) {
llvm::AtomicOrdering AO = llvm::AtomicOrdering::SequentiallyConsistent;
llvm::SyncScope::ID SSID;
switch (BuiltinID) {
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b32:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_add_i64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_sub_i64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_i64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_min_u64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_i64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_max_u64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_and_b64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_or_b64:
+ case AMDGPU::BI__builtin_amdgcn_wave_reduce_xor_b64: {
+ Intrinsic::ID IID = getIntrinsicIDforWaveReduction(BuiltinID);
+ llvm::Value *Value = EmitScalarExpr(E->getArg(0));
+ llvm::Value *Strategy = EmitScalarExpr(E->getArg(1));
+ llvm::Function *F = CGM.getIntrinsic(IID, {Value->getType()});
+ return Builder.CreateCall(F, {Value, Strategy});
+ }
case AMDGPU::BI__builtin_amdgcn_div_scale:
case AMDGPU::BI__builtin_amdgcn_div_scalef: {
// Translate from the intrinsics's struct return to the builtin's out
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
index bf022bc6eb446..16f5a524f3094 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn.cl
@@ -398,6 +398,384 @@ void test_s_sendmsghalt_var(int in)
__builtin_amdgcn_s_sendmsghalt(1, in);
}
+// CHECK-LABEL: @test_wave_reduce_add_i32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
+void test_wave_reduce_add_i32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_i64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
+void test_wave_reduce_add_i64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_i32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
+void test_wave_reduce_add_i32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_i64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
+void test_wave_reduce_add_i64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_i32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.add.i32(
+void test_wave_reduce_add_i32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_add_i64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.add.i64(
+void test_wave_reduce_add_i64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_add_i64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
+void test_wave_reduce_sub_i32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
+void test_wave_reduce_sub_i64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
+void test_wave_reduce_sub_i32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
+void test_wave_reduce_sub_i64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.sub.i32(
+void test_wave_reduce_sub_i32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_sub_i64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.sub.i64(
+void test_wave_reduce_sub_i64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_sub_i64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
+void test_wave_reduce_and_b32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
+void test_wave_reduce_and_b64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
+void test_wave_reduce_and_b32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
+void test_wave_reduce_and_b64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.and.i32(
+void test_wave_reduce_and_b32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_and_b64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.and.i64(
+void test_wave_reduce_and_b64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_and_b64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
+void test_wave_reduce_or_b32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
+void test_wave_reduce_or_b64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
+void test_wave_reduce_or_b32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
+void test_wave_reduce_or_b64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.or.i32(
+void test_wave_reduce_or_b32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_or_b64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.or.i64(
+void test_wave_reduce_or_b64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_or_b64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
+void test_wave_reduce_xor_b32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
+void test_wave_reduce_xor_b64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
+void test_wave_reduce_xor_b32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
+void test_wave_reduce_xor_b64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.xor.i32(
+void test_wave_reduce_xor_b32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_xor_b64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.xor.i64(
+void test_wave_reduce_xor_b64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_xor_b64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
+void test_wave_reduce_min_i32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
+void test_wave_reduce_min_i64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
+void test_wave_reduce_min_i32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
+void test_wave_reduce_min_i64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.min.i32(
+void test_wave_reduce_min_i32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_i64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.min.i64(
+void test_wave_reduce_min_i64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_i64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
+void test_wave_reduce_min_u32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
+void test_wave_reduce_min_u64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
+void test_wave_reduce_min_u32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
+void test_wave_reduce_min_u64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umin.i32(
+void test_wave_reduce_min_u32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_min_u64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umin.i64(
+void test_wave_reduce_min_u64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_min_u64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
+void test_wave_reduce_max_i32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
+void test_wave_reduce_max_i64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
+void test_wave_reduce_max_i32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
+void test_wave_reduce_max_i64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.max.i32(
+void test_wave_reduce_max_i32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_i64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.max.i64(
+void test_wave_reduce_max_i64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_i64(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u32_default
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
+void test_wave_reduce_max_u32_default(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u32(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u64_default
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
+void test_wave_reduce_max_u64_default(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u64(in, 0);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u32_iterative
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
+void test_wave_reduce_max_u32_iterative(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u32(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u64_iterative
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
+void test_wave_reduce_max_u64_iterative(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u64(in, 1);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u32_dpp
+// CHECK: {{.*}}call{{.*}} i32 @llvm.amdgcn.wave.reduce.umax.i32(
+void test_wave_reduce_max_u32_dpp(global int* out, int in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u32(in, 2);
+}
+
+// CHECK-LABEL: @test_wave_reduce_max_u64_dpp
+// CHECK: {{.*}}call{{.*}} i64 @llvm.amdgcn.wave.reduce.umax.i64(
+void test_wave_reduce_max_u64_dpp(global int* out, long in)
+{
+ *out = __builtin_amdgcn_wave_reduce_max_u64(in, 2);
+}
+
// CHECK-LABEL: @test_s_barrier
// CHECK: {{.*}}call{{.*}} void @llvm.amdgcn.s.barrier(
void test_s_barrier()
>From dcaea7230ffd22bb56eeb2eb8956c28164b909fa Mon Sep 17 00:00:00 2001
From: Aaditya <Aaditya.AlokDeshpande at amd.com>
Date: Mon, 28 Jul 2025 14:35:08 +0530
Subject: [PATCH 2/2] Using `int32_t` inplace of `int`
---
a.out | Bin 0 -> 22264 bytes
clang/include/clang/Basic/BuiltinsAMDGPU.def | 36 ++++++------
hip.sh | 20 +++++++
test.cpp | 58 +++++++++++++++++++
4 files changed, 96 insertions(+), 18 deletions(-)
create mode 100755 a.out
create mode 100644 hip.sh
create mode 100644 test.cpp
diff --git a/a.out b/a.out
new file mode 100755
index 0000000000000000000000000000000000000000..2dbcd9ad6edc6908ee25aacddc07417f96ca46f2
GIT binary patch
literal 22264
zcmeHPdvIJ=c|Te|A`)AY^8k^A<Pzi{6SS6ONnQs+R+eP164_QQCzt at ay4r`dWVO5O
z-gWGRViU&!Rk&k9NnOgcX$OKKJRBaW5 at uvg?8Zr(b__H00RdcwbXE-%)6$U1P}JY=
z-1Duj*1M4z@(0t=k at kM)`<?Im&bi+`_uTWi_w4T**b=CzQ4#{`U5eb1t(Jzu`lwWB
zhlzlMRY+Ze<F)EdY9;vP8uQY_%F+>`V*w8rTI;cD87TFRne<};OQ#6ET60PXl at gLt
zZ$4Ni>d_+`Jyxh{@ObL2z)x^FKcNj5d2&MidOXL6-{qx0e5a)&LK{u_hu3)N<$6<L
z51-0fIwJIC4^~rj1cNCj%`v|jli!TVPv{xr2%+N7oiDgC`sG5)DgGQZdS^PJjBuvY
zT0mo(Q%b1#{tuwXa{jh~`uv?TemiBZyHLv*6e{_xPbEjXI at YIREvaO>u(xGzsH>%`
zBbd(w+qFHqkLR>?`)%3+-D<MXe6<60F0GJw88E8?ejPCW%0mc1C46fY+y-7L-+fi^
zi7NP!D)^_X;GeC6KU)QV0~}PToNumzZ?1xGuY$j?3ZAZlKT-vss)F-AsZ{<;P&q2$
z4OQ at P6?|J2JXQt2rwV=o{$Hc6RN<V37 at 7l=n;`~Dd^m4uUhrmPXK=fR>qLDB;67M}
zdtw>dyA_~3u22`DRjI{qCF0(1CDQow(wV~hxdwlqgzYfm-aWNUeQ5z-qkaN at xRf4H
zuNWSX%Dr`EJzLt(0%LI3J%}c1rTP|l+MC$F4zY{WX~R1&|0!a%>bp2H3|_zQ5{0t=
zm*ERnCSS)EV>xbU>VM-C522U=)xaAZfo|e6g7ixKlzOAGz!1Cq=&GKk at l`$R`o24$
zY#S{|)Xuw3&b7y#WE#lmcqVNS8*DB<n#{ZLoSn!y<8eEgPGo>a_d0eWnRZgid*WbY
z at ycRj$?RYz+vB8CJ(+YOIa<g$ZZeZrzz3W{Iy!c1JeQ8A)HozEQNr8iWAX85b{~w>
zNFhIVz7_k9A-5x%DY%NUwl3Sw<lS7{8AmDe at toTq1=<!Jb8 at zubCPbpKib!~6WE3k
zC!dUpbpI}USF7DO)MXFeI<&z~!P at qrOe)`(WbD=<w=?SQ%f_MHw!z+J-<^!b)5sy|
z_U-XRH)IQLU(}0tCQ>LXQc&tGxj3Ao3mMPlbS#zczpyezccDa`D3SKc*0|f3%Vlyy
zZZ4S~)s}d=?pQ8!w>{#-?5LA>SsFMbo{pt{>oZsJy-B)p`;Z$-X7aF^k9YLr9DCZ#
zS(f3w8rSu}aXn>TsGX*^Y;oMps2(|U{Yo&dxvl{Es@|5CS~D+>`sj?uV99%qZKNOV
z4m{70kauIz8*bp^txIhk=-=G4d$+wI*x`={H!a}W+Nd`gMHQ!7o7&pb1C>^yQJ*2Z
zJ=h-X^i&bw(4p36vhMm^COY1-G1wMt<&$=ODz#^P{YWysJ{UCrKxt~+Go_9UTfEs!
zo1vrnaMXvJTUg@{`*3sXYW$cFmuEc_j{9)-+ayT)NKE>MOq2GJ_?rlraMtv@*uC at T
zTlC at ne9!rCf4(pK at CE&z4_Z1W+BaQD&DZ*H>7y~B!H1(^D}_cMesKv^%JSi87)znq
zhp#H3O11j%)jmAr!|Qx_*oULxE`^8>Zz!Q?pMCfmAHLg%`=6`BKAe3j31dF|asffJ
zKKu$FzSoDp-iIIX;cxKaAM at eY`0yh>ywQgr_2F;y;Sc+8 at A={-9`oT>`Q&R%uX8wJ
zY4q)uiu4a!XcRNK8!cryu7wV4+)~MOEV(BcD>x}@kCQ6I^GJ2$dDp&6KPT+Fojq|p
zkz$2t+;(CyyU^9)%VsIM<bl`31Ge2W(7k=Dy<^LkfgRnw_U7BR_YU;gcB$=dSeT96
zOJnGNa1ds9b+i=Hccn9Tr_IQ0G+k(s(d?ADg|9V6 at Pymb(SoPmXf#d2-&o#hX&FuI
z-PGzV%xZ~S^1zY at mOQZJfh7+td0@!{OCDJAz>)`+Jh0?}B at Zll;5G8VesBFhEOVgC
z)LFfjBId&;ve!)Z-Hb_5uK&Ld$2F>6&;L?a<jU#GHJ$;XG8al%^7JGY9>ZpmRjAA;
zP*2ax%4Dld`WY%GBGj)(lVLN78ot5HPiWR;KO==Q^B&BPddH34 at ppKFLXFzI&x{`D
zPc9 at s$yfX$>-ZO1PUBwnyROHFSYU*5dcfd9#XWR?3TYN-_qN`xgIIPy_!VzDC+JOK
z=0EX6b?Ku)XEd9$v2>Mo)#rjvZZ!YN`@a&zjAkyL&)fN|6OBJIn#zngsm}*9i9|k5
zu{_q=Y99^iiBbD5OqG7w6R}4Mi9|eiP(*7z(dB;8V<Y=)tyq#;^=eX21V?k3Le|d5
zN5|u77fXG6<1w2BP><^qL6J*|SOgqR*M9RnoZ<~%2&SC$Xu%ncAKQ_Qr+Wshp2td%
zy(gZ_V?FS}fSL-9JA3U!%5m+xGr7A=_OS2|%W>*yEIpoBWFq)m!E7$M2a8dqg854Q
zXfR*$_$7~dOg@`TrOZY6d@#Rnd?b_l*XtI^C3shCDp*J-6Pet&E~!0V)S9mc3;B57
zj_pHvvA&39)#zPM;WCu64m{>D{;a<m<g9igmw}(@vhbHpg~6iiXxx1QT;+*DsaWE~
z8dED#`N{(Chf9~CcAEA;Ce7are9bc4k-RAp!uqIGFX$R|v6A~)?k7_Dao)e77OND?
zxwfv<t{d))9Dn<YFWh?QmCJZP at aJD!0$U1{*M(ig*^ABn<6?O0!*;ZgjM*DEY)W**
z+B at 31I@;U2I^--%+5XlM(`?ZhuNuPYtLo#B at ULCHs>1Ji;(axN8MeIvwQ>eTeYobn
z>A>pyrted#Y5&Z}Ppk|C9t;EmA65sl&wGvhNARk~fu-M)50>h{QXM#99w%zbbc|$;
zUe=c79fC0vk+uC{HZ}<V^P9O0X`@LuW>i=dndlGbY*|ruG%WKlwA%5~MzD1>=~~VC
z{Me1_*CRSs7- at -*7uG|%Wgs~^=BDv7O-p|oZ#UshCjELxOE#DJKs at SNR(ofAsNL#{
zb%r`)p^fc{k=TZ4=cbOBv(ecQZQbNVH;uHn$Kwe+Z8j#l)~T}W+e_Ia<FJ=`Fq#>^
zQNKjOgJczCAzRxzHncmf8#jhF#yVQNHjZ?3wRc9_V~J2G)IQSMk#LZ>A-Zv$8c3$m
zlf2Oy80fXybuL!(Yn+AOvkT&UZY%YRx$=(2+YGLW?=WO=8J0-WM+`2!(63d&zh4Dk
ztqb&vInfu|T?Nln!T+iX{>>`*vsLhS(lLQrCF3V4&krD9ssHuAz$^8?EZC{k|5Cwp
z#*M2WI+*CLsvuUCgJVv9OqnBbH>ZNROw4f|6~v2Ku9jwZ6mUG1%h*{AGx!o&a3r5s
zL2sB6jOINJ!?MGQ{&;e{Fb)}1TaE%0`^c^{0y>6lyqHH9?Wm;-<2Jj&7bM8R(S-<(
z;H^W9N-jj&H#G2uF^lv(y^j2zXmrtX=pMfwx8$REyD>Vps0OYWhX}eHC_V=X7o?Mm
zM+<tmbV1U5l?ECvUJXp86bBL6f*xa)E=3upy<Uc0h;J7y%_5z2yZ1fYySMfC$j$OQ
zP~MhoYovXuHg1<J<hdd|b@{9keR+Nf9X5h|3^6a6pOEJo-xYglZZ0GG^86CoZ1g2v
z^bL%+biH{TNtfrH&~kk#ztC%;&oP(i%kx#}Fw*>i=im=-1w%~qWn3sU>=#D7oL>)&
z`ttmj at us{4Sek!=j1FZ5=mbbz88=EhD)T-vFC^Un at t=&-_<0C1(LZkHvt*uXP#gsr
zhO+F{Hed_%WnRm4V9Mpo`JEMPG9SsGaepY+caT_)QT{^p_Za<FD*1xw?<g}atkC_?
zDP4P?zA`UO9yNmWH{X{O|4aS<C@}H8U;U_gx$@8aAmWm*z at I>*l75?c3A3-PaOL`2
z2Kyr#Yp}$mrZrbBdq)g0Ud9=W<qv?K13e1L4<5b*5(a$^^a$urK^yR5<|-Jo at M0y6
zwt~8#M?gOgIs?k at G|qy47L)<AAv)CDvs2aVZK%0w)kU>OkVc&E<%W>w^_1wnNLN`N
zI7=}IT<SFCBKQpe3xTTcx`qb=J*!vdVGJzO$M8!b{T9$tdZ14WF#TireFo_+(|hZN
z10!`~y>;3CIvAkMWBC0V>6g&io9i07>l%CO4%8f6_Q!$1f7E at frnl}$O+SFZx9S>^
z{pPybZgAeW20wwy8Jv*x*?Dd+<M%$i)n<XRJa-@?;`}b-8;HD at cyC>6U}qg?j8rr5
zUn6oI at ojYtyNrAocn9p@5-bS+U4P9hZ9nUu`1YNVi62J}{^Z=?uD-UJwr3-UZ)$_e
z+!fz%RO*+BVEwiCYdy-Jd~QD67MXbAr4=u%LydC-FTVBZ72jhtGP&aG(2E>=S-F?N
z)Vuz3ZZ2}T^EA?muPj&U{U^nqr~fmfzv<}}KQ{XJivFs|<V_DWL-E^Cgy9d&Cmg=%
zg`dsM6`#=-y)s1({&aU_;zLcL3Abta#N|zq!@W(R$V6{bn6c%IMIsYtNe3f`Kh)G5
zIqWtyMh@?4YKR;jXsRtv>RjBW=E+ at 6t&z!rrpDgM-X?1RiQyBwS6gb%ZK}nuvG_qs
z9y~R6a^5?MBi*+Tp$KCqkfw?@B_>XuR#9+~)4ffoB#0o}+G#DY9mhj6^@k3^)sacS
zPZEwy{O}0?s(7<W{t}Z<5aMr|hseY$S_T)ZukG35S}j at FA3<4b^+_T()&~Wuzv1EO
z<IMGGBZ;xy#2OeIW|=~C1^ltA$)YP7iw97LBa?1Z2(Adz70q--D_t=tu4sfSnu`fa
z9z4}RX*q`<J;5fWcrTK8iZ`LuC-^q~^gcN8Nxqw1+>f-$4H_*TM0|RfNmPqWoGv;v
zI=SLvT>HpmxEPr{+M2gi<k0u)@8?%v2ks3icV+$KJv}eJJ~Hv6;!Z8T4AE)cKy!nw
z_0#7(+gq6gi#xOk++4g3F_!8#kZhabReW#vohMK5t^VmK3Q_FUDkt@|DsDt}P!DVD
zN0EuAi)#^`&Onf68#R7<8Z}qV{u~FNonPZ@`X`>QaB)YF0_7J$1}1(*qh}B;89fOG
z#;5t5o#oecKBJ#Unl^f#w^t(tMjrw(M(;(mWHb$iM)?$bF;d(Q)@OE5r=HYhEp7k<
zO9L9y=iG*9$x at TXPfu@xrQ&J$hmN=zpsuDxK2GYd?S%p>^lSnG3C-i?XvPh_8f|st
z&|Km26H}e=_U)dv$i#|Yu7%>eXX at 9WX4bD+-Zafy?#aVAJi$Zv)Eok9Jj-`tc)()4
zYo5hwke459KL!)9?OrihyoPp9ulREy3=pmLH(qEm22V7gP8L6h0-oTvZPR3*oXJ{h
z_v~?iZ0{nISFC74J$Y&NFMvGvoW>6Rv=)`~Z&8mUhi2Ts+!<}$lV=*ULY8x#-+K#=
zYoEul{@O5N<i}W=*|R*CwcjBRyK44~=Kqd7%B^N+H2)O&W#o@*{z>xMs}F1bYvlQe
zn?0iWFOp|hZ}x!Z|C&6z-?LfGf0{fSqS;~1KS=&{<OenXN91`2&4x99A9+6BXInLY
z5BXK(EzRf1vllblp!pAwXRl;dX?}z}dl|Fmegk0b9pw3Vo;|Dio#gSv#QAIf7V>;d
z&CY0kGkG=_v&S`mBY8G$vkz;&ojjgwIDgH*jXZh|IDgGwN1m%QvsukwNgmG-oWJHT
zCC}l(>>&6vTP;M<7eUR^!O(_?$IG`qS1t3BdEprN4I_i_{D#rb_m#=>n?`>+ziDLP
ziTDFA97E`0OH=&>LF}03=GR4dLJYDzeb&euvnp#MrWooO)Nls<i~yB<SaBGN0ZUU}
zcm}c0ipEe}X=y5b6*jF5GQT)u(&<<R`F{N>k1g4gZ(OGCv~&vlSPXKWtY{3U%ynQz
zV>oO0a~KnnKeNIjQMiPfZ!r0z#%Z26%xe+YGBC*Z at tW}+R&KL2_j*IGH?+&pEr#B1
zXu{AvhCX2ELxw(L=;MYyZRkH4`tOFmWa!H6CVxY(H?+&pEr#B1Xu{AvhCX2ELxys(
zeQExe^V_e<6tl0UER3?<;I~|-%zfuwMRe_v5D3((8PhJw{m1)qIp&KOT2YD(UjXmP
zNIMoX0v2cuzF405AY=3G6w_-k;tX|JgiGNT#3=uakq?`c_k+^Dw1<>q7{!nBwRn6o
zNIUq4I(}K{{+IjuevMzQ<h~dDS5T}8G%C5z1fPU_rStt0;4II%TP?)q$4-wUPF&h?
zi-`#Th#$+z$2^0S^9P7iUYT}9kE?+FT+3Xk8jZY^=W>=)ty1j1Fi88j4mj<wXToru
zJ}UJN;FZe%6dI<Qz`XyZoyxOdfj5 at gX=6tH3k3pKoY($5&3pq5rFK%}pVRo7QhO=*
z1F*w-b=;Iw#)aPg2=JS!Z`T`zDC}H{^EzvugO`~0o;?=}XD|DVi2VuRoM#_gZD~l(
zi}M_km&lTS5qLudJAVV*@(Y^ycYup;{mI2DI9{cwD1S9 at o|k+-=^A6VYTR$xi^qZU
zyx1*ekos__iu|Xl;Ge02KT-uhUIqVq;Fb1q;Qj`c{QMSNw at P?R6?}UY{EjL(d#aV}
zXMtDRSK^*3@?WchpQ?gCUj=^yDqf}XTwMk4FnIYs7Xww~-%|z8R>2Qf!5;xm|5#=1
zk_sldh?@1*r-I#^`&(QGD@^`%Dzh-_Em|4I##+_<8dhobDxPwvsu!%BU^a!7Ct1<r
zZU70##Ygy9^TG}bdYgeE?=3YIjE|XJD#l{4ES#)^amxl3#I6-d<ryVlJGq>*&x0k7
zyuAgp(G0U*xN7FDlOe^kp%>9EV1dlGospzXuS9e12zE(G2eD&>9n0vQB<5p5TLVeG
zvj*R8>WkQ0c6M*;vj_TzcG=vK!{|UVGs3lM57OtQ6B$DyNXdvkbqyu;a!oLr&0 at EQ
zflctCx8p{CvahWboZe=_b_WM{+2hLII<RAN_keBZ3nO;F%{4QlIphw6P|}Ps%*!&K
zi4{_DJL+bz)dly0u(9HV3uTWv*pVWB{@YoekasohyK_!98y8JuD at I%J7q+yZw_+)L
z`;C1<^gDhob`pu&*(7$@(A$?_4-r%fULC_r;qejgP#?DV=-!M^c$MA11L;@*wXtwV
z|8ge?PeV39@!Kfd4aGZM<7(Nw;?;LX$!ClMc$unflM;*XNk;MBq6;@`FTdLN?%swd
zm3CEGWKDBv*Of(gg86^G>xzG;mP(ticz0ywcPYzTqQ33DY>i&K#ar}K&@IwGbPIGi
z7O)KUco}zloF^-BQce(EgrDfkIOUW9ETs7Z9%l;z=Os915q+6oC>lU0^I%z}UnTu&
zuD^0DFZwdxKVt?&T%Tk>H$VrG6Z&?boEs2c<_8+ggvW8C$NX9L^73=J(!c&J^BXfp
zzuD+Z{!)Get3E`SCi%<!Nzv%b{E5iZ2mY`P49g_?G9MH&6EMe&aC!dXzjqt`%|=e<
zht3*(|NIc=Y^406?*L<&+Ks;S>jzhR0LO*?Ao_wO5uv`6Uo}`>5@!Mtrq+72bTqi|
z1w{P%GVYq{^dzL5B43{WN6Yl*_Z{>!Vxnkh`St%)nZAsdLtUQ2ZWhiL%JctAq*26N
z?$Y>5_J{0{fxWR&cKYgbW%@E7cIus;g3PNK_4DfQqh<Ose|FaBo69Kb=jlIIrhm?i
z8_yYi&XxN^dHFdW=cKUIPni#t{UaB<eosLcqw|u!%o|LZ8^7G&<vhpz^u6RS^LMiE
z<r!0c-v11uFX!>kz$ha6vLDrvFhGQ6kCcw0FTnGN5EK2WVM`}Xg*8zTo}4d~xl_G}
zG=Kgw-WGi+r<f?u{}*NYGXMKb*wZMAqM_ybuK at R#U*?gHcY6vPpZY`C&mhi;Dwe;s
z3>W>y-k+$2a#<Fhr5=y8nj7mHuvg<J<(GQJHBj-rr(yK^J$fAr=L==5y0VJ?&RaaC
Le=BP+%Ju&bz?d`u
literal 0
HcmV?d00001
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index c8b324193e9fb..a9bf747d0aaa1 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -357,24 +357,24 @@ BUILTIN(__builtin_amdgcn_set_fpenv, "vWUi", "n")
//===----------------------------------------------------------------------===//
-BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "UiUii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "UiUii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "iii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWii", "nc")
-BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWii", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_u32, "ZUiZUiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_u32, "ZUiZUiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_b32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_b32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_b32, "ZiZiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_add_i64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_sub_i64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_i64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_min_u64, "WUiWUiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_i64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_max_u64, "WUiWUiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_and_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_or_b64, "WiWiZi", "nc")
+BUILTIN(__builtin_amdgcn_wave_reduce_xor_b64, "WiWiZi", "nc")
//===----------------------------------------------------------------------===//
// R600-NI only builtins.
diff --git a/hip.sh b/hip.sh
new file mode 100644
index 0000000000000..b7bf7f67908ba
--- /dev/null
+++ b/hip.sh
@@ -0,0 +1,20 @@
+
+ "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -E -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include/cuda_wrappers -idirafter /opt/rocm-7.1.0/lib/llvm/bin/../../../include -include __clang_hip_runtime_wrapper.h -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-hip-amdgcn-amd-amdhsa-gfx90a.hipi -x hip test.cpp
+
+ "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -emit-llvm-bc -emit-llvm-uselists -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fdeprecated-macro -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -disable-llvm-passes -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-hip-amdgcn-amd-amdhsa-gfx90a.bc -x hip-cpp-output test-hip-amdgcn-amd-amdhsa-gfx90a.hipi
+
+ "build/bin/clang-22" -cc1 -triple amdgcn-amd-amdhsa -aux-triple x86_64-unknown-linux-gnu -S -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model pic -pic-level 2 -fhalf-no-semantic-interposition -mframe-pointer=none -fno-rounding-math -mconstructor-aliases -aux-target-cpu x86-64 -fcuda-is-device -mllvm -amdgpu-internalize-symbols -fcuda-allow-variadic-functions -fvisibility=hidden -fapply-global-visibility-to-externs -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/hip.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ocml.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/ockl.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_daz_opt_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_unsafe_math_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_finite_only_off.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_correctly_rounded_sqrt_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_wavefrontsize64_on.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_isa_version_90a.bc -mlink-builtin-bitcode /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/amdgcn/bitcode/oclc_abi_version_500.bc -target-cpu gfx90a -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fno-autolink -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -o test-hip-amdgcn-amd-amdhsa-gfx90a.s -x ir test-hip-amdgcn-amd-amdhsa-gfx90a.bc
+
+ "build/bin/clang-22" -cc1as -triple amdgcn-amd-amdhsa -filetype obj -main-file-name test.cpp -target-cpu gfx90a -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -dwarf-version=5 -mrelocation-model pic -o test-hip-amdgcn-amd-amdhsa-gfx90a.o test-hip-amdgcn-amd-amdhsa-gfx90a.s
+ "build/bin/lld" -flavor gnu -m elf64_amdgpu --no-undefined -shared -plugin-opt=-amdgpu-internalize-symbols -plugin-opt=mcpu=gfx90a -plugin-opt=O3 --lto-CGO3 -save-temps --whole-archive -o test-hip-amdgcn-amd-amdhsa-gfx90a.out test-hip-amdgcn-amd-amdhsa-gfx90a.o --no-whole-archive
+ "build/bin/clang-offload-bundler" -type=o -bundle-align=4096 -targets=host-x86_64-unknown-linux-gnu,hipv4-amdgcn-amd-amdhsa--gfx90a -input=/dev/null -input=test-hip-amdgcn-amd-amdhsa-gfx90a.out -output=test.cpp-hip-amdgcn-amd-amdhsa.hipfb -verbose
+ "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -E -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include/cuda_wrappers -idirafter /opt/rocm-7.1.0/lib/llvm/bin/../../../include -include __clang_hip_runtime_wrapper.h -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/x86_64-linux-gnu/c++/12 -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../include/c++/12/backward -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -internal-isystem /opt/rocm-7.1.0/lib/llvm/lib/clang/21/include -internal-isystem /usr/local/include -internal-isystem /usr/lib/gcc/x86_64-linux-gnu/12/../../../../x86_64-linux-gnu/include -internal-externc-isystem /usr/include/x86_64-linux-gnu -internal-externc-isystem /include -internal-externc-isystem /usr/include -O3 -fdeprecated-macro -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.hipi -x hip test.cpp
+
+ "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -emit-llvm-bc -emit-llvm-uselists -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -fdeprecated-macro -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -fcxx-exceptions -fexceptions -vectorize-loops -vectorize-slp -disable-llvm-passes -fcuda-include-gpubinary test.cpp-hip-amdgcn-amd-amdhsa.hipfb -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.bc -x hip-cpp-output test-host-x86_64-unknown-linux-gnu.hipi
+
+ "build/bin/clang-22" -cc1 -triple x86_64-unknown-linux-gnu -aux-triple amdgcn-amd-amdhsa -S -dumpdir a.out- -save-temps=cwd -disable-free -clear-ast-before-backend -disable-llvm-verifier -discard-value-names -main-file-name test.cpp -mrelocation-model static -mframe-pointer=none -fmath-errno -fno-rounding-math -mconstructor-aliases -funwind-tables=2 -target-cpu x86-64 -tune-cpu generic -debugger-tuning=gdb -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -v -fcoverage-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -resource-dir /opt/rocm-7.1.0/lib/llvm/lib/clang/21 -O3 -ferror-limit 19 -fhip-new-launch-api -fgnuc-version=4.2.1 -vectorize-loops -vectorize-slp -cuid=b19930c6dbd68254 -fcuda-allow-variadic-functions -faddrsig -D__GCC_HAVE_DWARF2_CFI_ASM=1 -o test-host-x86_64-unknown-linux-gnu.s -x ir test-host-x86_64-unknown-linux-gnu.bc
+
+ "build/bin/clang-22" -cc1as -triple x86_64-unknown-linux-gnu -filetype obj -main-file-name test.cpp -target-cpu x86-64 -fdebug-compilation-dir=/home/aalokdes/dockerx/work/llvm-trunk/llvm-project -dwarf-version=5 -mrelocation-model static -o test-host-x86_64-unknown-linux-gnu.o test-host-x86_64-unknown-linux-gnu.s
+ "build/bin/ld.lld" -z relro --hash-style=gnu --eh-frame-hdr -m elf_x86_64 -dynamic-linker /lib64/ld-linux-x86-64.so.2 -o a.out /lib/x86_64-linux-gnu/crt1.o /lib/x86_64-linux-gnu/crti.o /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/clang_rt.crtbegin-x86_64.o -L/usr/lib/gcc/x86_64-linux-gnu/12 -L/usr/lib/gcc/x86_64-linux-gnu/12/../../../../lib64 -L/lib/x86_64-linux-gnu -L/lib/../lib64 -L/usr/lib/x86_64-linux-gnu -L/usr/lib/../lib64 -L/lib -L/usr/lib --enable-new-dtags test-host-x86_64-unknown-linux-gnu.o -L/opt/rocm-7.1.0/lib/llvm/bin/../../../lib -rpath /opt/rocm-7.1.0/lib/llvm/bin/../../../lib -lamdhip64 -lstdc++ -lm /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/libclang_rt.builtins-x86_64.a -lgcc_s -lc /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/libclang_rt.builtins-x86_64.a -lgcc_s /opt/rocm-7.1.0/lib/llvm/lib/clang/21/lib/linux/clang_rt.crtend-x86_64.o /lib/x86_64-linux-gnu/crtn.o
+
+ ./a.out
diff --git a/test.cpp b/test.cpp
new file mode 100644
index 0000000000000..5400171e32d0f
--- /dev/null
+++ b/test.cpp
@@ -0,0 +1,58 @@
+#include <hip/hip_runtime.h>
+#include <iostream>
+
+using namespace std;
+
+#define HIP_CHECK(status) \
+ if (status != hipSuccess) { \
+ std::cerr << "HIP error: " << hipGetErrorString(status) \
+ << " at line " << __LINE__ << std::endl; \
+ std::exit(EXIT_FAILURE); \
+ }
+
+
+extern "C" __global__ void test_kernel_wave_reduce_add_u64(int32_t* a, int32_t N, int num_active_lanes) {
+ int32_t threadID = threadIdx.x ;
+ int32_t reduced_val = __builtin_amdgcn_wave_reduce_add_i32(N, 0); // uniform value + between waves
+ // test with : std::numeric_limits<unsigned int>::max()
+
+ // for(int i = 0; i < num_active_lanes; i++) a[i] = reduced_val;
+ a[threadID] = reduced_val;
+ // a[thre] = thre + 10;
+}
+
+
+int main() {
+ int num_active_lanes = 1;
+ // std::unique_ptr<int32_t[]> h1 = std::make_unique<int32_t[]>(num_active_lanes);
+ int32_t *h1 = (int32_t *)malloc(sizeof(int32_t) * num_active_lanes);
+
+ // std::unique_ptr<int32_t> val_to_reduce = std::make_unique<int32_t>(10);
+ int32_t *val_to_reduce = (int32_t *)malloc(sizeof(int32_t));
+ *val_to_reduce = std::numeric_limits<uint32_t>::min();
+
+ // initialize the memory
+ for (int i = 0; i < num_active_lanes; i++) {
+ h1[i] = 99;
+ }
+
+ size_t size = num_active_lanes * sizeof(int32_t);
+ int32_t* d1 = nullptr;
+ HIP_CHECK(hipMalloc(&d1, size));
+ // HIP_CHECK(hipMemcpy(d1, h1.get(), size, hipMemcpyHostToDevice));
+ HIP_CHECK(hipMemcpy(d1, h1, size, hipMemcpyHostToDevice));
+ // std::cout << "before kernel: " << std::endl;
+ hipLaunchKernelGGL(test_kernel_wave_reduce_add_u64, dim3(1), dim3(num_active_lanes), 0, 0, d1, *val_to_reduce, num_active_lanes);
+ // std::cout << "after kernel: "<< std::endl;
+ // HIP_CHECK(hipMemcpy(h1.get(), d1, size, hipMemcpyDeviceToHost));
+ HIP_CHECK(hipMemcpy(h1, d1, size, hipMemcpyDeviceToHost));
+
+
+ std::cout << "individual values: ";
+ for(int i = 0; i < 1; i++){
+ std::cout << h1[i] << ", ";
+ // std::cout << std::hex << h1[i] << ", ";
+ }
+ std::cout << std::endl;
+ HIP_CHECK(hipFree(d1));
+}
\ No newline at end of file
More information about the llvm-branch-commits
mailing list