[clang] [AMDGPU][Clang] Add builtins for gfx12 ray tracing intrinsics (PR #135224)

Thu Apr 10 10:52:38 PDT 2025

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Shilei Tian (shiltian)

<details>
<summary>Changes</summary>

__builtin_amdgcn_image_bvh8_intersect_ray
__builtin_amdgcn_image_bvh_dual_intersect_ray

For the above two builtins, the second and third return values of the intrinsics
are returned through the last two pointer-type function arguments.

__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn
__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn
__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn

For the last builtin, the intrinsic returns `{i64, i32}` but the builtin returns
`<2 x i64>`, so the second return value of the intrinsic is zero-extended to `i64`.

---
Full diff: https://github.com/llvm/llvm-project/pull/135224.diff


3 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+12) 
- (modified) clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp (+64-2) 
- (modified) clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl (+78) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index cbef637be213a..39fef9e4601f8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -506,6 +506,18 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_tr_b128_v4bf16, "V4yV4y*1", "nc", "g
 
 TARGET_BUILTIN(__builtin_amdgcn_ds_bpermute_fi_b32, "iii", "nc", "gfx12-insts")
 
+// For the following two builtins, the second and third return values of the
+// intrinsics are returned through the last two pointer-type function arguments.
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh8_intersect_ray, "V10UiWUifUcV3fV3fUiV4UiV3f*V3f*", "nc", "gfx12-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_dual_intersect_ray, "V10UiWUifUcV3fV3fV2UiV4UiV3f*V3f*", "nc", "gfx12-insts")
+
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn, "V2UiUiUiV4UiIi", "n", "gfx11-insts")
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn, "V2UiUiUiV8UiIi", "n", "gfx12-insts")
+
+// The intrinsic returns {i64, i32}, the builtin returns <2 x i64>.
+// The second return value of the intrinsic is zext'ed.
+TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi", "n", "gfx12-insts")
+
 //===----------------------------------------------------------------------===//
 // WMMA builtins.
 // Postfix w32 indicates the builtin requires wavefront size of 32.
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index b56b739094ff3..35c9f8ae48c80 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -616,19 +616,81 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
     return Builder.CreateCall(F, {NodePtr, RayExtent, RayOrigin, RayDir,
                                   RayInverseDir, TextureDescr});
   }
+  case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
+  case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_image_bvh8_intersect_ray:
+      IID = Intrinsic::amdgcn_image_bvh8_intersect_ray;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_image_bvh_dual_intersect_ray:
+      IID = Intrinsic::amdgcn_image_bvh_dual_intersect_ray;
+      break;
+    }
+    llvm::Value *NodePtr = EmitScalarExpr(E->getArg(0));
+    llvm::Value *RayExtent = EmitScalarExpr(E->getArg(1));
+    llvm::Value *InstanceMask = EmitScalarExpr(E->getArg(2));
+    llvm::Value *RayOrigin = EmitScalarExpr(E->getArg(3));
+    llvm::Value *RayDir = EmitScalarExpr(E->getArg(4));
+    llvm::Value *Offset = EmitScalarExpr(E->getArg(5));
+    llvm::Value *TextureDescr = EmitScalarExpr(E->getArg(6));
+
+    Address RetRayOriginPtr = EmitPointerWithAlignment(E->getArg(7));
+    Address RetRayDirPtr = EmitPointerWithAlignment(E->getArg(8));
+
+    llvm::Function *IntrinsicFunc = CGM.getIntrinsic(IID);
+
+    llvm::CallInst *CI = Builder.CreateCall(
+        IntrinsicFunc, {NodePtr, RayExtent, InstanceMask, RayOrigin, RayDir,
+                        Offset, TextureDescr});
+
+    llvm::Value *RetVData = Builder.CreateExtractValue(CI, 0);
+    llvm::Value *RetRayOrigin = Builder.CreateExtractValue(CI, 1);
+    llvm::Value *RetRayDir = Builder.CreateExtractValue(CI, 2);
+
+    Builder.CreateStore(RetRayOrigin, RetRayOriginPtr);
+    Builder.CreateStore(RetRayDir, RetRayDirPtr);
+
+    return RetVData;
+  }
+
+  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
+  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
+  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
+  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn: {
+    Intrinsic::ID IID;
+    switch (BuiltinID) {
+    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn:
+      IID = Intrinsic::amdgcn_ds_bvh_stack_rtn;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn:
+      IID = Intrinsic::amdgcn_ds_bvh_stack_push4_pop1_rtn;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn:
+      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop1_rtn;
+      break;
+    case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn:
+      IID = Intrinsic::amdgcn_ds_bvh_stack_push8_pop2_rtn;
+      break;
+    }
 
-  case AMDGPU::BI__builtin_amdgcn_ds_bvh_stack_rtn: {
     SmallVector<Value *, 4> Args;
     for (int i = 0, e = E->getNumArgs(); i != e; ++i)
       Args.push_back(EmitScalarExpr(E->getArg(i)));
 
-    Function *F = CGM.getIntrinsic(Intrinsic::amdgcn_ds_bvh_stack_rtn);
+    Function *F = CGM.getIntrinsic(IID);
     Value *Call = Builder.CreateCall(F, Args);
     Value *Rtn = Builder.CreateExtractValue(Call, 0);
     Value *A = Builder.CreateExtractValue(Call, 1);
     llvm::Type *RetTy = ConvertType(E->getType());
     Value *I0 = Builder.CreateInsertElement(PoisonValue::get(RetTy), Rtn,
                                             (uint64_t)0);
+    // ds_bvh_stack_push8_pop2_rtn returns {i64, i32} but the builtin returns
+    // <2 x i64>, zext the second value.
+    if (A->getType()->getPrimitiveSizeInBits() <
+        RetTy->getScalarType()->getPrimitiveSizeInBits())
+      A = Builder.CreateZExt(A, RetTy->getScalarType());
+
     return Builder.CreateInsertElement(I0, A, 1);
   }
   case AMDGPU::BI__builtin_amdgcn_mfma_scale_f32_16x16x128_f8f6f4:
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
index 7f73cdd61c80d..2cf7f3dc6f80e 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-raytracing.cl
@@ -3,6 +3,10 @@
 // RUN:   -emit-llvm -cl-std=CL2.0 -o - %s | FileCheck %s
 // RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1030 -S \
 // RUN:   -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=ISA %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -emit-llvm \
+// RUN:   -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12 %s
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1200 -S \
+// RUN:   -cl-std=CL2.0 -o - %s | FileCheck -check-prefix=GFX12ISA %s
 
 // Test llvm.amdgcn.image.bvh.intersect.ray intrinsic.
 
@@ -12,12 +16,18 @@
 // Postfix l indicates the 1st argument is i64 and postfix h indicates
 // the 4/5-th arguments are half4.
 
+typedef unsigned char uchar;
 typedef unsigned int uint;
 typedef unsigned long ulong;
+typedef float float3 __attribute__((ext_vector_type(3)));
 typedef float float4 __attribute__((ext_vector_type(4)));
 typedef double double4 __attribute__((ext_vector_type(4)));
 typedef half half4 __attribute__((ext_vector_type(4)));
+typedef uint uint2 __attribute__((ext_vector_type(2)));
 typedef uint uint4 __attribute__((ext_vector_type(4)));
+typedef uint uint8 __attribute__((ext_vector_type(8)));
+typedef uint uint10 __attribute__((ext_vector_type(10)));
+typedef ulong ulong2 __attribute__((ext_vector_type(2)));
 
 // CHECK: call <4 x i32> @llvm.amdgcn.image.bvh.intersect.ray.i32.v3f32
 // ISA: image_bvh_intersect_ray
@@ -59,3 +69,71 @@ void test_image_bvh_intersect_ray_lh(global uint4* out, ulong node_ptr,
            ray_origin, ray_dir, ray_inv_dir, texture_descr);
 }
 
+#if __has_builtin(__builtin_amdgcn_image_bvh8_intersect_ray)
+// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh8.intersect.ray(
+// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
+// GFX12: <3 x float> %ray_dir, i32 %offset, <4 x i32> %texture_descr)
+// GFX12ISA: image_bvh8_intersect_ray
+void test_image_bvh8_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
+    float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
+    float3 ray_origin, float3 ray_dir, uint offset, uint4 texture_descr)
+{
+  *ret_vdata = __builtin_amdgcn_image_bvh8_intersect_ray(node_ptr, ray_extent,
+           instance_mask, ray_origin, ray_dir, offset, texture_descr,
+           ret_ray_origin, ret_ray_dir);
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_image_bvh_dual_intersect_ray)
+// GFX12: call { <10 x i32>, <3 x float>, <3 x float> } @llvm.amdgcn.image.bvh.dual.intersect.ray(
+// GFX12: i64 %node_ptr, float %ray_extent, i8 %instance_mask, <3 x float> %ray_origin,
+// GFX12: <3 x float> %ray_dir, <2 x i32> %offset, <4 x i32> %texture_descr)
+// GFX12ISA: image_bvh_dual_intersect_ray
+void test_builtin_amdgcn_image_bvh_dual_intersect_ray(global uint10* ret_vdata, float3* ret_ray_origin,
+    float3* ret_ray_dir, ulong node_ptr, float ray_extent, uchar instance_mask,
+    float3 ray_origin, float3 ray_dir, uint2 offset, uint4 texture_descr)
+{
+  *ret_vdata = __builtin_amdgcn_image_bvh_dual_intersect_ray(node_ptr, ray_extent,
+           instance_mask, ray_origin, ray_dir, offset, texture_descr,
+           ret_ray_origin, ret_ray_dir);
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn)
+// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push4.pop1.rtn(
+// GFX12: i32 %addr, i32 %data0, <4 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push4_pop1_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(uint* ret_vdst, uint* ret_addr,
+    uint addr, uint data0, uint4 data1)
+{
+  uint2 ret = __builtin_amdgcn_ds_bvh_stack_push4_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
+  *ret_vdst = ret.x;
+  *ret_addr = ret.y;
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn)
+// GFX12: call { i32, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop1.rtn(
+// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push8_pop1_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(uint* ret_vdst, uint* ret_addr,
+    uint addr, uint data0, uint8 data1)
+{
+  uint2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop1_rtn(addr, data0, data1, /*constant offset=*/0);
+  *ret_vdst = ret.x;
+  *ret_addr = ret.y;
+}
+#endif
+
+#if __has_builtin(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn)
+// GFX12: call { i64, i32 } @llvm.amdgcn.ds.bvh.stack.push8.pop2.rtn(
+// GFX12: i32 %addr, i32 %data0, <8 x i32> %data1, i32 0)
+// GFX12ISA: ds_bvh_stack_push8_pop2_rtn
+void test_builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(ulong* ret_vdst, uint* ret_addr,
+    uint addr, uint data0, uint8 data1)
+{
+  ulong2 ret = __builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn(addr, data0, data1, /*constant offset=*/0);
+  *ret_vdst = ret.x;
+  *ret_addr = ret.y;
+}
+#endif

``````````

</details>


https://github.com/llvm/llvm-project/pull/135224