[llvm-branch-commits] [clang] [llvm] AMDGPU: Add support for v_dot2_f32_bf16 instruction for gfx950 (PR #117597)
Matt Arsenault via llvm-branch-commits
llvm-branch-commits at lists.llvm.org
Mon Nov 25 10:12:09 PST 2024
https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/117597
v_dot2_f32_bf16 was added in gfx11 along with v_dot2_f16_f16 and v_dot2_bf16_bf16.
All three instructions were part of Dot9 instructions in the compiler.
This patch will split existing dot9 (v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16)
into new dot9 (v_dot2_f16_f16 and v_dot2_bf16_bf16), and dot12 (v_dot2_f32_bf16).
All necessary changes to gfx11 and gfx12 are updated to reflect this change.
Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
>From f221f63e40154aaf7f97acc3e48a8b7ba5659f8d Mon Sep 17 00:00:00 2001
From: Sirish Pande <Sirish.Pande at amd.com>
Date: Fri, 10 May 2024 17:33:59 -0500
Subject: [PATCH] AMDGPU: Add support for v_dot2_f32_bf16 instruction for
gfx950
v_dot2_f32_bf16 was added in gfx11 along with v_dot2_f16_f16 and v_dot2_bf16_bf16.
All three instructions were part of Dot9 instructions in the compiler.
This patch will split existing dot9 (v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16)
into new dot9 (v_dot2_f16_f16 and v_dot2_bf16_bf16), and dot12 (v_dot2_f32_bf16).
All necessary changes to gfx11 and gfx12 are updated to reflect this change.
Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
---
clang/include/clang/Basic/BuiltinsAMDGPU.def | 2 +-
clang/test/CodeGenOpenCL/amdgpu-features.cl | 24 +++----
.../builtins-amdgcn-dl-insts-err.cl | 4 +-
.../CodeGenOpenCL/builtins-amdgcn-gfx950.cl | 25 +++++++
llvm/lib/Target/AMDGPU/AMDGPU.td | 14 +++-
llvm/lib/Target/AMDGPU/GCNSubtarget.h | 5 ++
llvm/lib/Target/AMDGPU/VOP3PInstructions.td | 5 +-
llvm/lib/TargetParser/TargetParser.cpp | 3 +
.../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll | 68 +++++++++++++++++++
llvm/test/MC/AMDGPU/gfx950_dlops.s | 61 +++++++++++++++++
.../Disassembler/AMDGPU/gfx950_dasm_vop3.txt | 60 ++++++++++++++++
11 files changed, 253 insertions(+), 18 deletions(-)
create mode 100644 llvm/test/MC/AMDGPU/gfx950_dlops.s
diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fd449697e91216..7d0019eead96b6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -263,7 +263,7 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940
TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts")
TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot12-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index db7fd76ec91189..0b698035ee54c7 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
// GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
// GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
// GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
@@ -101,17 +101,17 @@
// GFX1034: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1035: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
// GFX1036: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
-// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
+// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
kernel void test() {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 5db280f339e713..f409d2a110d753 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -23,8 +23,8 @@ kernel void builtins_amdgcn_dl_insts_err(
sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC); // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot9-insts}}
- fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}}
- fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}}
+ fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
+ fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true); // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 34eae17827ff7c..390d9a55e9b16d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -215,3 +215,28 @@ void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
*out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
}
+
+// CHECK-LABEL: define dso_local void @builtins_amdgcn_dl_insts(
+// CHECK-SAME: ptr addrspace(1) noundef [[OUT:%.*]], float noundef [[FC:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT: entry:
+// CHECK-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT: [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT: [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[FC]], ptr addrspace(5) [[FC_ADDR]], align 4
+// CHECK-NEXT: store <2 x i16> [[V2SSA]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
+// CHECK-NEXT: store <2 x i16> [[V2SSB]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
+// CHECK-NEXT: [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
+// CHECK-NEXT: [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
+// CHECK-NEXT: [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat>
+// CHECK-NEXT: [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
+// CHECK-NEXT: [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false)
+// CHECK-NEXT: [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT: store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT: ret void
+//
+void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
+ *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 434a6ea4f1b41b..90340f05dab24a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -732,7 +732,7 @@ def FeatureDot8Insts : SubtargetFeature<"dot8-insts",
def FeatureDot9Insts : SubtargetFeature<"dot9-insts",
"HasDot9Insts",
"true",
- "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16 instructions"
+ "Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions"
>;
def FeatureDot10Insts : SubtargetFeature<"dot10-insts",
@@ -747,6 +747,12 @@ def FeatureDot11Insts : SubtargetFeature<"dot11-insts",
"Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions"
>;
+def FeatureDot12Insts : SubtargetFeature<"dot12-insts",
+ "HasDot12Insts",
+ "true",
+ "Has v_dot2_f32_bf16 instructions"
+>;
+
def FeatureMAIInsts : SubtargetFeature<"mai-insts",
"HasMAIInsts",
"true",
@@ -1579,6 +1585,7 @@ def FeatureISAVersion9_5_Common : FeatureSet<
FeatureBF8ConversionScaleInsts,
FeatureFP4ConversionScaleInsts,
FeatureFP6BF6ConversionScaleInsts,
+ FeatureDot12Insts,
FeatureFP8Insts
])>;
@@ -1707,6 +1714,7 @@ def FeatureISAVersion11_Common : FeatureSet<
FeatureDot8Insts,
FeatureDot9Insts,
FeatureDot10Insts,
+ FeatureDot12Insts,
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureShaderCyclesRegister,
@@ -1790,6 +1798,7 @@ def FeatureISAVersion12 : FeatureSet<
FeatureDot9Insts,
FeatureDot10Insts,
FeatureDot11Insts,
+ FeatureDot12Insts,
FeatureNSAEncoding,
FeaturePartialNSAEncoding,
FeatureShaderCyclesHiLoRegisters,
@@ -2362,6 +2371,9 @@ def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">,
def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">,
AssemblerPredicate<(all_of FeatureDot11Insts)>;
+def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">,
+ AssemblerPredicate<(all_of FeatureDot12Insts)>;
+
def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 390849dd2e0564..7e994b84426bf0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -156,6 +156,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
bool HasDot9Insts = false;
bool HasDot10Insts = false;
bool HasDot11Insts = false;
+ bool HasDot12Insts = false;
bool HasMAIInsts = false;
bool HasFP8Insts = false;
bool HasFP8ConversionInsts = false;
@@ -825,6 +826,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
return HasDot11Insts;
}
+ bool hasDot12Insts() const {
+ return HasDot12Insts;
+ }
+
bool hasMAIInsts() const {
return HasMAIInsts;
}
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5d8dc5ccd18e55..ee68eb32d9173a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -406,12 +406,12 @@ def DOT2_BF16_Profile
let HasSrc1Mods = 1;
}
-let SubtargetPredicate = HasDot9Insts in {
+let SubtargetPredicate = HasDot12Insts in {
defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
int_amdgcn_fdot2_f32_bf16, 1>;
-} // End SubtargetPredicate = HasDot9Insts
+} // End SubtargetPredicate = HasDot12Insts
} // End let IsDOT = 1
@@ -2118,6 +2118,7 @@ defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mf
defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>;
defm V_MFMA_F32_32X32X64_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">;
defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>;
+defm V_DOT2_F32_BF16 : VOP3P_Real_vi<0x1a>;
defm V_MFMA_I32_32X32X16I8 : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
defm V_MFMA_I32_16X16X32I8 : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 23532b9214a892..110b94a57b802c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -374,6 +374,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["dot9-insts"] = true;
Features["dot10-insts"] = true;
Features["dot11-insts"] = true;
+ Features["dot12-insts"] = true;
Features["dl-insts"] = true;
Features["atomic-ds-pk-add-16-insts"] = true;
Features["atomic-flat-pk-add-16-insts"] = true;
@@ -406,6 +407,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["dot8-insts"] = true;
Features["dot9-insts"] = true;
Features["dot10-insts"] = true;
+ Features["dot12-insts"] = true;
Features["dl-insts"] = true;
Features["16-bit-insts"] = true;
Features["dpp"] = true;
@@ -475,6 +477,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
Features["permlane16-swap"] = true;
Features["permlane32-swap"] = true;
Features["ashr-pk-insts"] = true;
+ Features["dot12-insts"] = true;
Features["gfx950-insts"] = true;
[[fallthrough]];
case GK_GFX942:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index b0ef568fbdce31..fca418e3f82002 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -1,6 +1,8 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
@@ -18,6 +20,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
+; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: v_mov_b32_e32 v2, s1
+; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2 clamp
+; GFX950-NEXT: s_nop 2
+; GFX950-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX950-NEXT: s_endpgm
+;
+; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
+; GFX950-ISEL: ; %bb.0: ; %entry
+; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
+; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
+; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1 clamp
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-ISEL-NEXT: s_nop 1
+; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX950-ISEL-NEXT: s_endpgm
+
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
@@ -46,6 +81,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0
; GFX11-NEXT: global_store_b32 v1, v0, s[0:1]
; GFX11-NEXT: s_endpgm
+;
+; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
+; GFX950: ; %bb.0: ; %entry
+; GFX950-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-NEXT: v_mov_b32_e32 v0, 0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX950-NEXT: s_load_dword s1, s[14:15], 0x0
+; GFX950-NEXT: s_load_dword s2, s[10:11], 0x0
+; GFX950-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-NEXT: v_mov_b32_e32 v1, s0
+; GFX950-NEXT: v_mov_b32_e32 v2, s1
+; GFX950-NEXT: v_dot2_f32_bf16 v1, s2, v1, v2
+; GFX950-NEXT: s_nop 2
+; GFX950-NEXT: global_store_dword v0, v1, s[8:9]
+; GFX950-NEXT: s_endpgm
+;
+; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
+; GFX950-ISEL: ; %bb.0: ; %entry
+; GFX950-ISEL-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT: s_load_dword s0, s[12:13], 0x0
+; GFX950-ISEL-NEXT: s_load_dword s1, s[14:15], 0x0
+; GFX950-ISEL-NEXT: s_load_dword s2, s[10:11], 0x0
+; GFX950-ISEL-NEXT: s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v0, s0
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, s1
+; GFX950-ISEL-NEXT: v_dot2_f32_bf16 v0, s2, v0, v1
+; GFX950-ISEL-NEXT: v_mov_b32_e32 v1, 0
+; GFX950-ISEL-NEXT: s_nop 1
+; GFX950-ISEL-NEXT: global_store_dword v1, v0, s[8:9]
+; GFX950-ISEL-NEXT: s_endpgm
+
ptr addrspace(1) %r,
ptr addrspace(1) %a,
ptr addrspace(1) %b,
diff --git a/llvm/test/MC/AMDGPU/gfx950_dlops.s b/llvm/test/MC/AMDGPU/gfx950_dlops.s
new file mode 100644
index 00000000000000..4ae60ac785f496
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_dlops.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s
+
+v_dot2_f32_bf16 v5, v1, v2, v3
+// GFX950: v_dot2_f32_bf16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_bf16 v5, v1, v2, s3
+// GFX950: v_dot2_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18]
+
+v_dot2_f32_bf16 v2, v1, 0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 0.5, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -0.5, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 1.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -1.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 2.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -2.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 4.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -4.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 0.15915494, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, 0.5, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -0.5, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 1.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -1.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 2.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -2.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 4.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -4.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
index ccc55352413777..ca8b1750a579e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -780,6 +780,66 @@
# GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04
+# GFX950: v_dot2_f32_bf16 v5, v1, v2, v3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c]
+0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c
+
+# GFX950: v_dot2_f32_bf16 v5, v1, v2, s3 ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18]
+0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2 ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c
+
# GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04
More information about the llvm-branch-commits
mailing list