[llvm-branch-commits] [clang] [llvm] [AMDGPU] Builtin support for wmma_f64_16x16x4_f64 (PR #203144)
Stanislav Mekhanoshin via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Wed Jun 10 18:47:34 PDT 2026
https://github.com/rampitec updated https://github.com/llvm/llvm-project/pull/203144
>From b9b64c5f99592b33cbbb6c9339a0a83954d2e879 Mon Sep 17 00:00:00 2001
From: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Date: Wed, 10 Jun 2026 17:32:05 -0700
Subject: [PATCH] [AMDGPU] Builtin support for wmma_f64_16x16x4_f64
---
clang/include/clang/Basic/BuiltinsAMDGPU.td | 6 ++++++
.../include/clang/Basic/BuiltinsAMDGPUDocs.td | 15 +++++++++++++++
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 5 +++++
.../CodeGen/amdgpu-builtin-is-invocable.c | 2 +-
.../CodeGen/amdgpu-builtin-processor-is.c | 2 +-
.../CodeGenCXX/dynamic-cast-address-space.cpp | 4 ++--
.../builtins-amdgcn-gfx1251-wmma-w32.cl | 19 +++++++++++++++++++
...ins-amdgcn-error-gfx1251-wmma-w32-param.cl | 17 +++++++++++++++++
llvm/lib/TargetParser/AMDGPUTargetParser.cpp | 2 ++
9 files changed, 68 insertions(+), 4 deletions(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1251-wmma-w32.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1251-wmma-w32-param.cl
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.td b/clang/include/clang/Basic/BuiltinsAMDGPU.td
index 8eed188b0f4b2..ccbf2f97a1313 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.td
@@ -1224,6 +1224,12 @@ def __builtin_amdgcn_swmmac_i32_16x16x128_iu8 : AMDGPUBuiltin<"_ExtVector<8, int
def __builtin_amdgcn_swmmac_f32_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, float>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<32, _Float16>, _ExtVector<8, float>, int, _Constant bool, _Constant bool)", [Const], "swmmac-gfx1250-insts,wavefrontsize32">;
def __builtin_amdgcn_swmmac_f16_16x16x64_f16 : AMDGPUBuiltin<"_ExtVector<8, _Float16>(_Constant bool, _ExtVector<16, _Float16>, _Constant bool, _ExtVector<32, _Float16>, _ExtVector<8, _Float16>, int, _Constant bool, _Constant bool)", [Const], "swmmac-gfx1250-insts,wavefrontsize32">;
+// GFX1251 WMMA builtins
+def __builtin_amdgcn_wmma_f64_16x16x4_f64 : AMDGPUBuiltin<"_ExtVector<8, double>(_Constant bool, _ExtVector<2, double>, _Constant bool, _ExtVector<2, double>, _Constant short, _ExtVector<8, double>, _Constant bool, _Constant bool)", [Const], "gfx1251-gemm-insts,wavefrontsize32"> {
+ let Documentation = [DocWMMA_f64_16x16x4_f64_GFX1251];
+ let ArgNames = ["a_neg", "a", "b_neg", "b", "c_mod", "c", "matrix_a_reuse", "matrix_b_reuse"];
+}
+
// GFX12.5 128B cooperative atomics
def __builtin_amdgcn_cooperative_atomic_load_32x4B : AMDGPUBuiltin<"int(int *, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
def __builtin_amdgcn_cooperative_atomic_store_32x4B : AMDGPUBuiltin<"void(int *, int, _Constant int, char const *)", [Const], "gfx1250-insts,wavefrontsize32">;
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
index 293431c5de7e8..5b8d14818c0c8 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
+++ b/clang/include/clang/Basic/BuiltinsAMDGPUDocs.td
@@ -645,6 +645,21 @@ scale variant.
}];
}
+def DocWMMA_f64_16x16x4_f64_GFX1251 : Documentation {
+ let Category = DocCatWMMA;
+ let Content = [{
+Wave matrix multiply-accumulate with f64 inputs and f64 output, using a
+4-deep K dimension. Multiplies a 16x4 matrix ``A`` by a 4x16 matrix ``B``
+and adds the 16x16 f64 accumulator ``C``.
+
+- ``a_neg`` / ``b_neg``: if true, negate the corresponding input matrix.
+- ``c_mod``: accumulator modifier (0 = none, 1 = neg, 2 = abs,
+ 3 = neg(abs)).
+- ``matrix_a_reuse`` / ``matrix_b_reuse``: hints to the hardware that
+ matrix A or B data can be reused from a previous WMMA instruction.
+}];
+}
+
//===----------------------------------------------------------------------===//
// Tensor DMA Builtins
//===----------------------------------------------------------------------===//
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 21f32b12c4fd1..68ead1d1c214d 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1443,6 +1443,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
// GFX1250 WMMA builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f64_16x16x4_f64:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
@@ -1643,6 +1644,10 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
break;
// GFX1250 WMMA builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f64_16x16x4_f64:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f64_16x16x4_f64;
+ break;
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
diff --git a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
index 8de4b0ec8bc20..e8b59f7df34c8 100644
--- a/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
+++ b/clang/test/CodeGen/amdgpu-builtin-is-invocable.c
@@ -54,7 +54,7 @@ void foo() {
// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
// AMDGCN-GFX1010: attributes #[[ATTR1:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx1251-gemm-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
// AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
diff --git a/clang/test/CodeGen/amdgpu-builtin-processor-is.c b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
index 8dfbb55566598..c9e6c80a321b9 100644
--- a/clang/test/CodeGen/amdgpu-builtin-processor-is.c
+++ b/clang/test/CodeGen/amdgpu-builtin-processor-is.c
@@ -63,7 +63,7 @@ void foo() {
//.
// AMDGCN-GFX1010: attributes #[[ATTR0]] = { convergent noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="gfx1010" }
//.
-// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// AMDGCNSPIRV: attributes #[[ATTR0]] = { noinline nounwind optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx1251-gemm-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
// AMDGCNSPIRV: attributes #[[ATTR1:[0-9]+]] = { nounwind }
// AMDGCNSPIRV: attributes #[[ATTR2:[0-9]+]] = { cold noreturn nounwind memory(inaccessiblemem: write) }
//.
diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
index 8252a511d6b44..e5873c1a8de90 100644
--- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
+++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp
@@ -107,9 +107,9 @@ const B& f(A *a) {
// CHECK: attributes #[[ATTR3]] = { nounwind }
// CHECK: attributes #[[ATTR4]] = { noreturn }
//.
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR0]] = { mustprogress noinline optnone "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx1251-gemm-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR1:[0-9]+]] = { nounwind willreturn memory(read) }
-// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
+// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR2:[0-9]+]] = { "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-features"="+16-bit-insts,+add-min-max-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-buffer-pk-add-bf16-inst,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-fmin-fmax-global-f32,+atomic-fmin-fmax-global-f64,+atomic-global-pk-add-bf16-inst,+bf16-cvt-insts,+bf16-pk-insts,+bf16-trans-insts,+bf8-cvt-scale-insts,+bitop3-insts,+ci-insts,+clusters,+cube-insts,+cvt-pknorm-vop2-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot13-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+f32-to-f16bf16-cvt-sr-insts,+flat-global-insts,+fp4-cvt-scale-insts,+fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-cvt-scale-insts,+fp8-insts,+fp8e5m3-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx1250-insts,+gfx1251-gemm-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+gws,+image-insts,+lerp-inst,+mai-insts,+mcast-load-insts,+mqsad-insts,+mqsad-pk-insts,+msad-insts,+permlane16-swap,+permlane32-swap,+pk-add-min-max-insts,+prng-inst,+qsad-insts,+s-memrealtime,+s-memtime-inst,+s-wakeup-barrier-inst,+sad-insts,+setprio-inc-wg-inst,+swmmac-gfx1200-insts,+swmmac-gfx1250-insts,+tanh-insts,+tensor-cvt-lut-insts,+transpose-load-f4f6-insts,+vmem-pref-insts,+vmem-to-lds-load-insts,+wavefrontsize32,+wavefrontsize64,+wmma-128b-insts,+wmma-256b-insts,+xf32-insts" }
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR3]] = { nounwind }
// WITH-NONZERO-DEFAULT-AS: attributes #[[ATTR4]] = { noreturn }
//.
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1251-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1251-wmma-w32.cl
new file mode 100644
index 0000000000000..720a923cae542
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1251-wmma-w32.cl
@@ -0,0 +1,19 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1251 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1251
+
+typedef double v8d __attribute__((ext_vector_type(8)));
+typedef double v2d __attribute__((ext_vector_type(2)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1251-LABEL: @test_amdgcn_wmma_f64_16x16x4_f64(
+// CHECK-GFX1251-NEXT: entry:
+// CHECK-GFX1251-NEXT: [[TMP0:%.*]] = tail call <8 x double> @llvm.amdgcn.wmma.f64.16x16x4.f64.v8f64.v2f64(i1 false, <2 x double> [[A:%.*]], i1 false, <2 x double> [[B:%.*]], i16 0, <8 x double> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1251-NEXT: store <8 x double> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA7:![0-9]+]]
+// CHECK-GFX1251-NEXT: ret void
+//
+void test_amdgcn_wmma_f64_16x16x4_f64(global v8d* out, v2d a, v2d b, v8d c)
+{
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(0, a, 0, b, 0, c, false, true);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1251-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1251-wmma-w32-param.cl
new file mode 100644
index 0000000000000..c4d3d4499f8db
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1251-wmma-w32-param.cl
@@ -0,0 +1,17 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1251 -verify -emit-llvm -o - %s
+
+typedef double v8d __attribute__((ext_vector_type(8)));
+typedef double v2d __attribute__((ext_vector_type(2)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+
+void test_amdgcn_wmma_f64_16x16x4_f64(global v8d* out, v2d a, v2d b, v8d c, int mod)
+{
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f64_16x16x4_f64' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f64_16x16x4_f64' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f64_16x16x4_f64' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f64_16x16x4_f64' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f64_16x16x4_f64(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f64_16x16x4_f64' must be a constant integer}}
+}
diff --git a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
index 24e6ece329c4c..d9ebd4f9ffd6d 100644
--- a/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
+++ b/llvm/lib/TargetParser/AMDGPUTargetParser.cpp
@@ -203,6 +203,8 @@ static void fillAMDGCNFeatureMap(StringRef GPU, const Triple &T,
AMDGPU::GPUKind Kind = parseArchAMDGCN(GPU);
switch (Kind) {
case GK_GFX1251:
+ Features["gfx1251-gemm-insts"] = true;
+ [[fallthrough]];
case GK_GFX1250:
Features["swmmac-gfx1200-insts"] = true;
Features["swmmac-gfx1250-insts"] = true;
More information about the llvm-branch-commits
mailing list