[clang] AMDGPU: Implement builtins for gfx1250 wmma instructions (PR #148991)
Changpeng Fang via cfe-commits
cfe-commits at lists.llvm.org
Tue Jul 15 16:50:33 PDT 2025
https://github.com/changpeng updated https://github.com/llvm/llvm-project/pull/148991
>From 074800e1906bcce1cc0110c759a6d141ce4ea322 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 15 Jul 2025 16:37:20 -0700
Subject: [PATCH 1/2] AMDGPU: Implement builtins for gfx1250 wmma instructions
Co-Authored-by: Stanislav Mekhanoshin <Stanislav.Mekhanoshin at amd.com>
Co-Authored-by: Shilei Tian <Shilei.Tian at amd.com>
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 40 ++
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 199 +++++++-
.../builtins-amdgcn-gfx1250-wmma-w32.cl | 433 ++++++++++++++++++
...ins-amdgcn-error-gfx1250-wmma-w32-param.cl | 242 ++++++++++
4 files changed, 913 insertions(+), 1 deletion(-)
create mode 100644 clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
create mode 100644 clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index 71e4b3486167a..29e1e99bba9ef 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -676,5 +676,45 @@ TARGET_BUILTIN(__builtin_amdgcn_cvt_f16_bf8, "hiIi", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_fp8, "V2hs", "nc", "gfx1250-insts")
TARGET_BUILTIN(__builtin_amdgcn_cvt_pk_f16_bf8, "V2hs", "nc", "gfx1250-insts")
+// GFX1250 WMMA and SWMMAC builtins
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x4_f32, "V8fIbV2fIbV2fIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_bf16, "V8fIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16_16x16x32_bf16, "V8yIbV16yIbV16yIsV8yIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16, "V8yIbV16yIbV16yIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8, "V8fV8iV8iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8, "V8hV8iV8iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_i32_16x16x64_iu8, "V8iIbV8iIbV8iV8iIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8, "V8hV16iV16iIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8, "V8fV16iV16iIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_16x16x32_f16, "V8fIbV16hIbV16hIsV8fIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f16_16x16x32_f16, "V8hIbV16hIbV16hIsV8hIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_wmma_f32_32x16x128_f4, "V16fV16iV8iIsV16f", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16_16x16x64_bf16, "V8yIbV16yIbV32yV8yiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16, "V8fIbV16yIbV32yV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8, "V8fV8iV16iV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8, "V8hV8iV16iV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_i32_16x16x128_iu8, "V8iIbV8iIbV16iV8iiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f32_16x16x64_f16, "V8fIbV16hIbV32hV8fiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+TARGET_BUILTIN(__builtin_amdgcn_swmmac_f16_16x16x64_f16, "V8hIbV16hIbV32hV8hiIbIb", "nc", "gfx1250-insts,wavefrontsize32")
+
#undef BUILTIN
#undef TARGET_BUILTIN
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index 0d8c2ed284994..e1f9cbe7aea26 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -822,7 +822,46 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w32:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_fp8_w64:
case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w32:
- case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64: {
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x32_bf8_bf8_w64:
+ // GFX1250 WMMA and SWMMAC builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8: {
// These operations perform a matrix multiplication and accumulation of
// the form:
@@ -837,6 +876,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
// "false".
bool AppendFalseForOpselArg = false;
unsigned BuiltinWMMAOp;
+ // Set when D and C differ in type: the return type is added to ArgTypes.
+ bool NeedReturnType = false;
switch (BuiltinID) {
case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x16_f16_w32:
@@ -975,6 +1016,160 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {2, 0, 1, 3}; // CD, A, B, Index
BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x32_bf8_bf8;
break;
+ // GFX1250 WMMA and SWMMAC builtins
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x4_f32:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x4_f32;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_bf16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x32_f16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x32_f16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x32_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16_16x16x32_bf16:
+ ArgsForMatchingMatrixTypes = {5, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16:
+ NeedReturnType = true;
+ ArgsForMatchingMatrixTypes = {1, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_bf16f32_16x16x32_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x64_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x64_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {3, 0};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_i32_16x16x64_iu8:
+ ArgsForMatchingMatrixTypes = {4, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_i32_16x16x64_iu8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_32x16x128_f4:
+ ArgsForMatchingMatrixTypes = {3, 0, 1};
+ BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_32x16x128_f4;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_f16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x64_f16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x64_f16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_bf16f32_16x16x64_bf16;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f32_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_fp8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_fp8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8:
+ ArgsForMatchingMatrixTypes = {2, 0, 1, 3};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_f16_16x16x128_bf8_bf8;
+ break;
+ case AMDGPU::BI__builtin_amdgcn_swmmac_i32_16x16x128_iu8:
+ ArgsForMatchingMatrixTypes = {4, 1, 3, 5};
+ BuiltinWMMAOp = Intrinsic::amdgcn_swmmac_i32_16x16x128_iu8;
+ break;
}
SmallVector<Value *, 6> Args;
@@ -984,6 +1179,8 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
Args.push_back(Builder.getFalse());
SmallVector<llvm::Type *, 6> ArgTypes;
+ if (NeedReturnType)
+ ArgTypes.push_back(ConvertType(E->getType()));
for (auto ArgIdx : ArgsForMatchingMatrixTypes)
ArgTypes.push_back(Args[ArgIdx]->getType());
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
new file mode 100644
index 0000000000000..e4ef3defdb341
--- /dev/null
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx1250-wmma-w32.cl
@@ -0,0 +1,433 @@
+// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py
+// REQUIRES: amdgpu-registered-target
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -target-feature +wavefrontsize32 -emit-llvm -o - %s | FileCheck %s --check-prefix=CHECK-GFX1250
+
+typedef float v16f __attribute__((ext_vector_type(16)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef float v2f __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef half v32h __attribute__((ext_vector_type(32)));
+typedef __bf16 v32bf16 __attribute__((ext_vector_type(32)));
+typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
+typedef int v16i __attribute__((ext_vector_type(16)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x4_f32(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x4.f32.v8f32.v2f32(i1 false, <2 x float> [[A:%.*]], i1 false, <2 x float> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4:![0-9]+]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.bf16.v8f32.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 false)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, true, false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16.16x16x32.bf16.v8bf16.v16bf16(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x bfloat> [[C:%.*]], i1 false, i1 false)
+// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c)
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, false);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_bf16f32_16x16x32_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.wmma.bf16f32.16x16x32.bf16.v8bf16.v16bf16.v8f32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <16 x bfloat> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, true, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.fp8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.fp8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x64_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x64.bf8.bf8.v8f32.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.fp8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.fp8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x64_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x64.bf8.bf8.v8f16.v8i32(<8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_i32_16x16x64_iu8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.wmma.i32.16x16x64.iu8.v8i32.v8i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <8 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c)
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x32_f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x32.f16.v8f32.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x32_f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x32.f16.v8f16.v16f16(i1 false, <16 x half> [[A:%.*]], i1 false, <16 x half> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.fp8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.fp8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f16_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.wmma.f16.16x16x128.bf8.bf8.v8f16.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x half> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h c)
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 true, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, true, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.fp8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.fp8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f c)
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8.v8f32.v16i32(<16 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], i16 0, <8 x float> [[C:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f c) // Lowering test: one tail call to llvm.amdgcn.wmma.f32.16x16x128.bf8.bf8, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <16 x float> @llvm.amdgcn.wmma.f32.32x16x128.f4.v16f32.v16i32.v8i32(<16 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], i16 0, <16 x float> [[C:%.*]])
+// CHECK-GFX1250-NEXT: store <16 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 64, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c) // Lowering test: one tail call to llvm.amdgcn.wmma.f32.32x16x128.f4, result stored through 'out'. NOTE(review): 'wmma_f32' appears twice in the test name, presumably a copy/paste slip; a rename would also have to update the label check above.
+{
+ *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, 0, c);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x64.bf16; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16_16x16x64_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x bfloat> @llvm.amdgcn.swmmac.bf16.16x16x64.bf16.v8bf16.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x bfloat> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x bfloat> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf16 b, v8bf16 c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.bf16.16x16x64.bf16; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_bf16f32_16x16x64_bf16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16.v8f32.v16bf16.v32bf16.i32(i1 false, <16 x bfloat> [[A:%.*]], i1 false, <32 x bfloat> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.bf16f32.16x16x64.bf16; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_fp8_fp8(global v8f* out, v8i a, v16i b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x128.fp8.fp8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_fp8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_fp8_bf8(global v8f* out, v8i a, v16i b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x128.fp8.bf8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x128_fp8_bf8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_bf8_fp8(global v8f* out, v8i a, v16i b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x128.bf8.fp8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_fp8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8.v8f32.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x128_bf8_bf8(global v8f* out, v8i a, v16i b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x128.bf8.bf8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x128_bf8_bf8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_fp8_fp8(global v8h* out, v8i a, v16i b, v8h c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f16.16x16x128.fp8.fp8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_fp8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_fp8_bf8(global v8h* out, v8i a, v16i b, v8h c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f16.16x16x128.fp8.bf8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x128_fp8_bf8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_bf8_fp8(global v8h* out, v8i a, v16i b, v8h c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f16.16x16x128.bf8.fp8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_fp8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8.v8f16.v8i32.v16i32.i32(<8 x i32> [[A:%.*]], <16 x i32> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x128_bf8_bf8(global v8h* out, v8i a, v16i b, v8h c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f16.16x16x128.bf8.bf8 with 'index' passed as i32, result stored through 'out'.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x128_bf8_bf8(a, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_i32_16x16x128_iu8(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x i32> @llvm.amdgcn.swmmac.i32.16x16x128.iu8.v8i32.v8i32.v16i32.i32(i1 false, <8 x i32> [[A:%.*]], i1 false, <16 x i32> [[B:%.*]], <8 x i32> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x i32> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.i32.16x16x128.iu8; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f32_16x16x64_f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x float> @llvm.amdgcn.swmmac.f32.16x16x64.f16.v8f32.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x float> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x float> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 32, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f32.16x16x64.f16; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, true);
+}
+
+// CHECK-GFX1250-LABEL: @test_amdgcn_swmmac_f16_16x16x64_f16(
+// CHECK-GFX1250-NEXT: entry:
+// CHECK-GFX1250-NEXT: [[TMP0:%.*]] = tail call <8 x half> @llvm.amdgcn.swmmac.f16.16x16x64.f16.v8f16.v16f16.v32f16.i32(i1 false, <16 x half> [[A:%.*]], i1 false, <32 x half> [[B:%.*]], <8 x half> [[C:%.*]], i32 [[INDEX:%.*]], i1 false, i1 true)
+// CHECK-GFX1250-NEXT: store <8 x half> [[TMP0]], ptr addrspace(1) [[OUT:%.*]], align 16, !tbaa [[TBAA4]]
+// CHECK-GFX1250-NEXT: ret void
+//
+void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h c, int index) // Lowering test: one tail call to llvm.amdgcn.swmmac.f16.16x16x64.f16; the two literal 0 arguments show up as i1 false and 'index' is passed as i32.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, true);
+}
diff --git a/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
new file mode 100644
index 0000000000000..55d705e6ad238
--- /dev/null
+++ b/clang/test/SemaOpenCL/builtins-amdgcn-error-gfx1250-wmma-w32-param.cl
@@ -0,0 +1,242 @@
+// REQUIRES: amdgpu-registered-target
+
+// RUN: %clang_cc1 -triple amdgcn-unknown-unknown -target-cpu gfx1250 -verify -emit-llvm -o - %s
+
+typedef float v16f __attribute__((ext_vector_type(16)));
+typedef float v8f __attribute__((ext_vector_type(8)));
+typedef float v2f __attribute__((ext_vector_type(2)));
+typedef half v8h __attribute__((ext_vector_type(8)));
+typedef half v16h __attribute__((ext_vector_type(16)));
+typedef half v32h __attribute__((ext_vector_type(32)));
+typedef __bf16 v32bf16 __attribute__((ext_vector_type(32)));
+typedef __bf16 v16bf16 __attribute__((ext_vector_type(16)));
+typedef __bf16 v8bf16 __attribute__((ext_vector_type(8)));
+typedef int v16i __attribute__((ext_vector_type(16)));
+typedef int v8i __attribute__((ext_vector_type(8)));
+
+void test_amdgcn_wmma_f32_16x16x4_f32(global v8f* out, v2f a, v2f b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x4_f32(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x4_f32' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x32_bf16(global v8f* out, v16bf16 a, v16bf16 b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_bf16_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8bf16 c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_bf16_16x16x32_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_bf16f32_16x16x32_bf16(global v8bf16* out, v16bf16 a, v16bf16 b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_bf16f32_16x16x32_bf16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_bf16f32_16x16x32_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_fp8_fp8(global v8f* out, v8i a, v8i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_fp8_bf8(global v8f* out, v8i a, v8i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_fp8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_bf8_fp8(global v8f* out, v8i a, v8i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x64_bf8_bf8(global v8f* out, v8i a, v8i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x64_bf8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_fp8_fp8(global v8h* out, v8i a, v8i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_fp8_bf8(global v8h* out, v8i a, v8i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_fp8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_bf8_fp8(global v8h* out, v8i a, v8i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x64_bf8_bf8(global v8h* out, v8i a, v8i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x64_bf8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_i32_16x16x64_iu8(global v8i* out, v8i a, v8i b, v8i c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(mod, a, 0, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, mod, b, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_i32_16x16x64_iu8(0, a, 0, b, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_i32_16x16x64_iu8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x32_f16(global v8f* out, v16h a, v16h b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x32_f16' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x32_f16(global v8h* out, v16h a, v16h b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(mod, a, 0, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, mod, b, 0, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x32_f16(0, a, 0, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x32_f16' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_fp8_fp8(global v8h* out, v16i a, v16i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_fp8_bf8(global v8h* out, v16i a, v16i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_fp8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_bf8_fp8(global v8h* out, v16i a, v16i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f16_16x16x128_bf8_bf8(global v8h* out, v16i a, v16i b, v8h c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f16_16x16x128_bf8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_fp8_fp8(global v8f* out, v16i a, v16i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_fp8_bf8(global v8f* out, v16i a, v16i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_fp8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_bf8_fp8(global v8f* out, v16i a, v16i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_fp8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_16x16x128_bf8_bf8(global v8f* out, v16i a, v16i b, v8f c, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call; every call must be diagnosed as needing a constant integer.
+{
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, mod, c, false, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, mod, false); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}}
+ *out = __builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8(a, b, 0, c, false, mod); // expected-error {{'__builtin_amdgcn_wmma_f32_16x16x128_bf8_bf8' must be a constant integer}}
+}
+
+void test_amdgcn_wmma_f32_wmma_f32_32x16x128_f4(global v16f* out, v16i a, v8i b, v16f c, int mod) // Negative test: the runtime value 'mod' in the third argument slot must be diagnosed as needing a constant integer. NOTE(review): 'wmma_f32' appears twice in the test name, presumably a copy/paste slip.
+{
+ *out = __builtin_amdgcn_wmma_f32_32x16x128_f4(a, b, mod, c); // expected-error {{'__builtin_amdgcn_wmma_f32_32x16x128_f4' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_bf16_16x16x64_bf16(global v8bf16* out, v16bf16 a, v32bf16 b, v8bf16 c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_bf16_16x16x64_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_bf16f32_16x16x64_bf16(global v8f* out, v16bf16 a, v32bf16 b, v8f c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_bf16f32_16x16x64_bf16' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_i32_16x16x128_iu8(global v8i* out, v8i a, v16i b, v8i c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_i32_16x16x128_iu8(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_i32_16x16x128_iu8' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_f32_16x16x64_f16(global v8f* out, v16h a, v32h b, v8f c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f32_16x16x64_f16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f32_16x16x64_f16' must be a constant integer}}
+}
+
+void test_amdgcn_swmmac_f16_16x16x64_f16(global v8h* out, v16h a, v32h b, v8h c, int index, int mod) // Negative test: the runtime value 'mod' is placed in one argument slot per call and must be diagnosed each time; the runtime 'index' argument is not diagnosed.
+{
+ *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(mod, a, 0, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, mod, b, c, index, false, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, mod, false); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}}
+ *out = __builtin_amdgcn_swmmac_f16_16x16x64_f16(0, a, 0, b, c, index, false, mod); // expected-error {{'__builtin_amdgcn_swmmac_f16_16x16x64_f16' must be a constant integer}}
+}
>From 16f76445dba1c7b362a1e06dd808e9874d1bac83 Mon Sep 17 00:00:00 2001
From: Changpeng Fang <changpeng.fang at amd.com>
Date: Tue, 15 Jul 2025 16:50:00 -0700
Subject: [PATCH 2/2] AMDGPU: Implement builtins for gfx1250 wmma instructions
Fix a format error
---
clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp | 2 +-
1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
index e1f9cbe7aea26..dcfdea648e93c 100644
--- a/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
+++ b/clang/lib/CodeGen/TargetBuiltins/AMDGPU.cpp
@@ -1090,7 +1090,7 @@ Value *CodeGenFunction::EmitAMDGPUBuiltinExpr(unsigned BuiltinID,
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f16_16x16x128_bf8_bf8;
break;
- case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
+ case AMDGPU::BI__builtin_amdgcn_wmma_f32_16x16x128_fp8_fp8:
ArgsForMatchingMatrixTypes = {3, 0};
BuiltinWMMAOp = Intrinsic::amdgcn_wmma_f32_16x16x128_fp8_fp8;
break;
More information about the cfe-commits mailing list