[clang] [AMDGPU] Modifies raytracing and wmma builtin def to take _Float16 for HIP/C++ (PR #175039)

Thu Jan 8 09:56:45 PST 2026

llvmbot wrote:




@llvm/pr-subscribers-backend-amdgpu

Author: Rana Pratap Reddy (ranapratap55)

<details>
<summary>Changes</summary>

For raytrace and wmma builtins, using 'x' in the def to take _Float16 for HIP/C++ and half for OpenCL.

---

Patch is 39.74 KiB, truncated to 20.00 KiB below, full version: https://github.com/llvm/llvm-project/pull/175039.diff


6 Files Affected:

- (modified) clang/include/clang/Basic/BuiltinsAMDGPU.def (+12-12) 
- (added) clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip (+62) 
- (added) clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip (+62) 
- (added) clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip (+96) 
- (added) clang/test/CodeGenHIP/builtins-amdgcn-wmma-w32.hip (+89) 
- (added) clang/test/CodeGenHIP/builtins-amdgcn-wmma-w64.hip (+90) 


``````````diff

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index bb823704c84c8..f189e34aac707 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -325,9 +325,9 @@ TARGET_BUILTIN(__builtin_amdgcn_s_ttracedata_imm, "vIs", "n", "gfx10-insts")
 // Postfix h indicates the 4/5-th arguments are half4.
 //===----------------------------------------------------------------------===//
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray, "V4UiUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_h, "V4UiUifV4fV4xV4xV4Ui", "nc", "gfx10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_l, "V4UiWUifV4fV4fV4fV4Ui", "nc", "gfx10-insts")
-TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4hV4hV4Ui", "nc", "gfx10-insts")
+TARGET_BUILTIN(__builtin_amdgcn_image_bvh_intersect_ray_lh, "V4UiWUifV4fV4xV4xV4Ui", "nc", "gfx10-insts")
 
 
 //===----------------------------------------------------------------------===//
@@ -343,20 +343,20 @@ TARGET_BUILTIN(__builtin_amdgcn_s_wait_event_export_ready, "v", "n", "gfx11-inst
 // Postfix w32 indicates the builtin requires wavefront size of 32.
 // Postfix w64 indicates the builtin requires wavefront size of 64.
 //===----------------------------------------------------------------------===//
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16hV16hV8f", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32, "V8fV16xV16xV8f", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32, "V8fV16sV16sV8f", "nc", "gfx11-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32, "V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16hV16hV16hV16hIb", "nc", "gfx11-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w32, "V16xV16xV16xV16xIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w32, "V16sV16sV16sV16sIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32, "V8iIbV4iIbV4iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32, "V8iIbV2iIbV2iV8iIb", "nc", "gfx11-insts,wavefrontsize32")
 
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16hV16hV4f", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64, "V4fV16xV16xV4f", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64, "V4fV16sV16sV4f", "nc", "gfx11-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64, "V8xV16xV16xV8xIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8hV16hV16hV8hIb", "nc", "gfx11-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_tied_w64, "V8xV16xV16xV8xIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_tied_w64, "V8sV16sV16sV8sIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64, "V4iIbV4iIbV4iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64, "V4iIbV2iIbV2iV4iIb", "nc", "gfx11-insts,wavefrontsize64")
@@ -590,9 +590,9 @@ TARGET_BUILTIN(__builtin_amdgcn_ds_bvh_stack_push8_pop2_rtn, "V2WUiUiUiV8UiIi",
 // Therefore, we add an "_gfx12" suffix to distinguish them from the existing
 // builtins.
 //===----------------------------------------------------------------------===//
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8hV8hV8f", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12, "V8fV8xV8xV8f", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12, "V8fV8sV8sV8f", "nc", "gfx12-insts,wavefrontsize32")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8hV8hV8hV8h", "nc", "gfx12-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12, "V8xV8xV8xV8x", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w32_gfx12, "V8sV8sV8sV8s", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w32_gfx12, "V8iIbiIbiV8iIb", "nc", "gfx12-insts,wavefrontsize32")
@@ -604,9 +604,9 @@ TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_fp8_w32_gfx12, "V8fV2iV2iV
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf8_bf8_w32_gfx12, "V8fV2iV2iV8f", "nc", "gfx12-insts,wavefrontsize32")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x32_iu4_w32_gfx12, "V8iIbV2iIbV2iV8iIb", "nc", "gfx12-insts,wavefrontsize32")
 
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4hV4hV4f", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12, "V4fV4xV4xV4f", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x16_bf16_w64_gfx12, "V4fV4sV4sV4f", "nc", "gfx12-insts,wavefrontsize64")
-TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4hV4hV4hV4h", "nc", "gfx12-insts,wavefrontsize64")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12, "V4xV4xV4xV4x", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x16_bf16_w64_gfx12, "V4sV4sV4sV4s", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu8_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
 TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x16_iu4_w64_gfx12, "V4iIbiIbiV4iIb", "nc", "gfx12-insts,wavefrontsize64")
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip
new file mode 100644
index 0000000000000..6e4ec6bf8c107
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w32.hip
@@ -0,0 +1,62 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+#define __device__ __attribute__((device))
+
+typedef float    v8f   __attribute__((ext_vector_type(8)));
+typedef _Float16 v8h   __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hipPDv8_fDv8_DF16_S1_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <8 x float>, align 32, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    store <8 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <8 x float>, ptr [[C_ADDR_ASCAST]], align 32
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <8 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v8f32.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x float> [[TMP2]])
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x float> [[TMP3]], ptr [[TMP4]], align 32
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12_hip(v8f* out, v8h a, v8h b, v8f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hipPDv8_DF16_S_S_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <8 x half> noundef [[A:%.*]], <8 x half> noundef [[B:%.*]], <8 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <8 x half>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    store <8 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <8 x half>, ptr [[A_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <8 x half>, ptr [[B_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <8 x half>, ptr [[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <8 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v8f16.v8f16(<8 x half> [[TMP0]], <8 x half> [[TMP1]], <8 x half> [[TMP2]], i1 false)
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <8 x half> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12_hip(v8h* out, v8h a, v8h b, v8h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w32_gfx12(a, b, c);
+}
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip
new file mode 100644
index 0000000000000..21bae28f85e8a
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-gfx12-wmma-w64.hip
@@ -0,0 +1,62 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1200 -target-feature +wavefrontsize64 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1200
+
+#define __device__ __attribute__((device))
+
+typedef float    v4f   __attribute__((ext_vector_type(4)));
+typedef _Float16 v4h   __attribute__((ext_vector_type(4)));
+
+// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hipPDv4_fDv4_DF16_S1_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x float> noundef [[C:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x float> [[C]], ptr [[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr [[C_ADDR_ASCAST]], align 16
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <4 x float> @llvm.amdgcn.wmma.f32.16x16x16.f16.v4f32.v4f16(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x float> [[TMP2]])
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x float> [[TMP3]], ptr [[TMP4]], align 16
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12_hip(v4f* out, v4h a, v4h b, v4f c) {
+  *out = __builtin_amdgcn_wmma_f32_16x16x16_f16_w64_gfx12(a, b, c);
+}
+
+// CHECK-GFX1200-LABEL: define dso_local void @_Z47test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hipPDv4_DF16_S_S_S_(
+// CHECK-GFX1200-SAME: ptr noundef [[OUT:%.*]], <4 x half> noundef [[A:%.*]], <4 x half> noundef [[B:%.*]], <4 x half> noundef [[C:%.*]]) #[[ATTR0]] {
+// CHECK-GFX1200-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[A_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[B_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[C_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1200-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[A_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[A_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[B_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[B_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    [[C_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[C_ADDR]] to ptr
+// CHECK-GFX1200-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[A]], ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[B]], ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[C]], ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP0:%.*]] = load <4 x half>, ptr [[A_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP1:%.*]] = load <4 x half>, ptr [[B_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP2:%.*]] = load <4 x half>, ptr [[C_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    [[TMP3:%.*]] = call contract <4 x half> @llvm.amdgcn.wmma.f16.16x16x16.f16.v4f16.v4f16(<4 x half> [[TMP0]], <4 x half> [[TMP1]], <4 x half> [[TMP2]], i1 false)
+// CHECK-GFX1200-NEXT:    [[TMP4:%.*]] = load ptr, ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1200-NEXT:    store <4 x half> [[TMP3]], ptr [[TMP4]], align 8
+// CHECK-GFX1200-NEXT:    ret void
+//
+__device__ void test_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12_hip(v4h* out, v4h a, v4h b, v4h c) {
+  *out = __builtin_amdgcn_wmma_f16_16x16x16_f16_w64_gfx12(a, b, c);
+}
diff --git a/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip
new file mode 100644
index 0000000000000..1f3c65201da30
--- /dev/null
+++ b/clang/test/CodeGenHIP/builtins-amdgcn-raytracing.hip
@@ -0,0 +1,96 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 6
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-amd-amdhsa -target-cpu gfx1030 -emit-llvm -fcuda-is-device -o - %s | FileCheck %s --check-prefix=CHECK-GFX1030
+
+#define __device__ __attribute__((device))
+
+typedef unsigned int  v4ui  __attribute__((ext_vector_type(4)));
+typedef float         v4f   __attribute__((ext_vector_type(4)));
+typedef _Float16      v4h   __attribute__((ext_vector_type(4)));
+typedef unsigned long ulong;
+
+// CHECK-GFX1030-LABEL: define dso_local void @_Z34test_image_bvh_intersect_ray_h_hipPDv4_jjfDv4_fDv4_DF16_S2_S_(
+// CHECK-GFX1030-SAME: ptr noundef [[OUT:%.*]], i32 noundef [[NODE:%.*]], float noundef [[TMAX:%.*]], <4 x float> noundef [[ORIGIN:%.*]], <4 x half> noundef [[DIR:%.*]], <4 x half> noundef [[INV_DIR:%.*]], <4 x i32> noundef [[EXT:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-GFX1030-NEXT:  [[ENTRY:.*:]]
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR:%.*]] = alloca i32, align 4, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR:%.*]] = alloca <4 x float>, align 16, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR:%.*]] = alloca <4 x half>, align 8, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR:%.*]] = alloca <4 x i32>, align 16, addrspace(5)
+// CHECK-GFX1030-NEXT:    [[OUT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[OUT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[NODE_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[NODE_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[TMAX_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[TMAX_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[ORIGIN_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ORIGIN_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[INV_DIR_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[INV_DIR_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    [[EXT_ADDR_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[EXT_ADDR]] to ptr
+// CHECK-GFX1030-NEXT:    store ptr [[OUT]], ptr [[OUT_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store i32 [[NODE]], ptr [[NODE_ADDR_ASCAST]], align 4
+// CHECK-GFX1030-NEXT:    store float [[TMAX]], ptr [[TMAX_ADDR_ASCAST]], align 4
+// CHECK-GFX1030-NEXT:    store <4 x float> [[ORIGIN]], ptr [[ORIGIN_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    store <4 x half> [[DIR]], ptr [[DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store <4 x half> [[INV_DIR]], ptr [[INV_DIR_ADDR_ASCAST]], align 8
+// CHECK-GFX1030-NEXT:    store <4 x i32> [[EXT]], ptr [[EXT_ADDR_ASCAST]], align 16
+// CHECK-GFX1030-NEXT:    [[TMP0:%.*]] = load i32, ptr [[NODE_ADDR_ASCAST]], align 4
+// CHECK-GFX1030-NEXT:    [[TMP1:%.*]] = load float, ptr [[TMAX_ADDR_ASCAST]], align 4
+// CHECK-GFX1030-NEXT:    [[TMP2:%.*]] = load <4 x float>, ptr ...
[truncated]

``````````

</details>


https://github.com/llvm/llvm-project/pull/175039