[llvm-branch-commits] [clang] [llvm] AMDGPU: Add support for v_dot2_f32_bf16 instruction for gfx950 (PR #117597)

Matt Arsenault via llvm-branch-commits llvm-branch-commits at lists.llvm.org
Mon Nov 25 10:12:09 PST 2024


https://github.com/arsenm created https://github.com/llvm/llvm-project/pull/117597

v_dot2_f32_bf16 was added in gfx11 along with v_dot2_f16_f16 and v_dot2_bf16_bf16.
All three instructions were part of Dot9 instructions in the compiler.

This patch will split existing dot9 (v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16)
into new dot9 (v_dot2_f16_f16 and v_dot2_bf16_bf16), and dot12 (v_dot2_f32_bf16).

All necessary changes to gfx11 and gfx12 are updated to reflect this change.

Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>

>From f221f63e40154aaf7f97acc3e48a8b7ba5659f8d Mon Sep 17 00:00:00 2001
From: Sirish Pande <Sirish.Pande at amd.com>
Date: Fri, 10 May 2024 17:33:59 -0500
Subject: [PATCH] AMDGPU: Add support for v_dot2_f32_bf16 instruction for
 gfx950

v_dot2_f32_bf16 was added in gfx11 along with v_dot2_f16_f16 and v_dot2_bf16_bf16.
All three instructions were part of Dot9 instructions in the compiler.

This patch will split existing dot9 (v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16)
into new dot9 (v_dot2_f16_f16 and v_dot2_bf16_bf16), and dot12 (v_dot2_f32_bf16).

All necessary changes to gfx11 and gfx12 are updated to reflect this change.

Co-authored-by: Sirish Pande <Sirish.Pande at amd.com>
---
 clang/include/clang/Basic/BuiltinsAMDGPU.def  |  2 +-
 clang/test/CodeGenOpenCL/amdgpu-features.cl   | 24 +++----
 .../builtins-amdgcn-dl-insts-err.cl           |  4 +-
 .../CodeGenOpenCL/builtins-amdgcn-gfx950.cl   | 25 +++++++
 llvm/lib/Target/AMDGPU/AMDGPU.td              | 14 +++-
 llvm/lib/Target/AMDGPU/GCNSubtarget.h         |  5 ++
 llvm/lib/Target/AMDGPU/VOP3PInstructions.td   |  5 +-
 llvm/lib/TargetParser/TargetParser.cpp        |  3 +
 .../AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll      | 68 +++++++++++++++++++
 llvm/test/MC/AMDGPU/gfx950_dlops.s            | 61 +++++++++++++++++
 .../Disassembler/AMDGPU/gfx950_dasm_vop3.txt  | 60 ++++++++++++++++
 11 files changed, 253 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/MC/AMDGPU/gfx950_dlops.s

diff --git a/clang/include/clang/Basic/BuiltinsAMDGPU.def b/clang/include/clang/Basic/BuiltinsAMDGPU.def
index fd449697e91216..7d0019eead96b6 100644
--- a/clang/include/clang/Basic/BuiltinsAMDGPU.def
+++ b/clang/include/clang/Basic/BuiltinsAMDGPU.def
@@ -263,7 +263,7 @@ TARGET_BUILTIN(__builtin_amdgcn_global_load_lds, "vv*1v*3IUiIiIUi", "t", "gfx940
 TARGET_BUILTIN(__builtin_amdgcn_fdot2, "fV2hV2hfIb", "nc", "dot10-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fdot2_f16_f16, "hV2hV2hh", "nc", "dot9-insts")
 TARGET_BUILTIN(__builtin_amdgcn_fdot2_bf16_bf16, "sV2sV2ss", "nc", "dot9-insts")
-TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot9-insts")
+TARGET_BUILTIN(__builtin_amdgcn_fdot2_f32_bf16, "fV2sV2sfIb", "nc", "dot12-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot2, "SiV2SsV2SsSiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_udot2, "UiV2UsV2UsUiIb", "nc", "dot2-insts")
 TARGET_BUILTIN(__builtin_amdgcn_sdot4, "SiSiSiSiIb", "nc", "dot1-insts")
diff --git a/clang/test/CodeGenOpenCL/amdgpu-features.cl b/clang/test/CodeGenOpenCL/amdgpu-features.cl
index db7fd76ec91189..0b698035ee54c7 100644
--- a/clang/test/CodeGenOpenCL/amdgpu-features.cl
+++ b/clang/test/CodeGenOpenCL/amdgpu-features.cl
@@ -89,7 +89,7 @@
 // GFX941: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX942: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64,+xf32-insts"
 // GFX9_4_Generic: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+mai-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
-// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
+// GFX950: "target-features"="+16-bit-insts,+ashr-pk-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot12-insts,+dot2-insts,+dot3-insts,+dot4-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+f16bf16-to-fp6bf6-cvt-scale-insts,+fp8-conversion-insts,+fp8-insts,+gfx8-insts,+gfx9-insts,+gfx90a-insts,+gfx940-insts,+gfx950-insts,+mai-insts,+permlane16-swap,+permlane32-swap,+prng-inst,+s-memrealtime,+s-memtime-inst,+wavefrontsize64"
 // GFX1010: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1011: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1012: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
@@ -101,17 +101,17 @@
 // GFX1034: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1035: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
 // GFX1036: "target-features"="+16-bit-insts,+ci-insts,+dl-insts,+dot1-insts,+dot10-insts,+dot2-insts,+dot5-insts,+dot6-insts,+dot7-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx8-insts,+gfx9-insts,+s-memrealtime,+s-memtime-inst,+wavefrontsize32"
-// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
-// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1100: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1101: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1102: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1103: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1150: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1151: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1152: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1153: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1200: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
+// GFX1201: "target-features"="+16-bit-insts,+atomic-buffer-global-pk-add-f16-insts,+atomic-ds-pk-add-16-insts,+atomic-fadd-rtn-insts,+atomic-flat-pk-add-16-insts,+atomic-global-pk-add-bf16-inst,+ci-insts,+dl-insts,+dot10-insts,+dot11-insts,+dot12-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+fp8-conversion-insts,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx12-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize32"
 
-// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
+// GFX1103-W64: "target-features"="+16-bit-insts,+atomic-fadd-rtn-insts,+ci-insts,+dl-insts,+dot10-insts,+dot12-insts,+dot5-insts,+dot7-insts,+dot8-insts,+dot9-insts,+dpp,+gfx10-3-insts,+gfx10-insts,+gfx11-insts,+gfx8-insts,+gfx9-insts,+wavefrontsize64"
 
 kernel void test() {}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
index 5db280f339e713..f409d2a110d753 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-dl-insts-err.cl
@@ -23,8 +23,8 @@ kernel void builtins_amdgcn_dl_insts_err(
 
   sOut[0] = __builtin_amdgcn_fdot2_bf16_bf16(v2ssA, v2ssB, sC);       // expected-error {{'__builtin_amdgcn_fdot2_bf16_bf16' needs target feature dot9-insts}}
 
-  fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}}
-  fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);  // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot9-insts}}
+  fOut[3] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false); // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
+  fOut[4] = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, true);  // expected-error {{'__builtin_amdgcn_fdot2_f32_bf16' needs target feature dot12-insts}}
 
   siOut[0] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, false);      // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
   siOut[1] = __builtin_amdgcn_sdot2(v2ssA, v2ssB, siC, true);       // expected-error {{'__builtin_amdgcn_sdot2' needs target feature dot2-insts}}
diff --git a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
index 34eae17827ff7c..390d9a55e9b16d 100644
--- a/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
+++ b/clang/test/CodeGenOpenCL/builtins-amdgcn-gfx950.cl
@@ -215,3 +215,28 @@ void test_ashr_pk_i8_i32(global int* out, uint src0, uint src1, uint src2) {
 void test_ashr_pk_u8_i32(global int* out, uint src0, uint src1, uint src2) {
   *out = __builtin_amdgcn_ashr_pk_u8_i32(src0, src1, src2);
 }
+
+// CHECK-LABEL: define dso_local void @builtins_amdgcn_dl_insts(
+// CHECK-SAME: ptr addrspace(1) noundef [[OUT:%.*]], float noundef [[FC:%.*]], <2 x i16> noundef [[V2SSA:%.*]], <2 x i16> noundef [[V2SSB:%.*]]) #[[ATTR0:[0-9]+]] {
+// CHECK-NEXT:  entry:
+// CHECK-NEXT:    [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// CHECK-NEXT:    [[FC_ADDR:%.*]] = alloca float, align 4, addrspace(5)
+// CHECK-NEXT:    [[V2SSA_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT:    [[V2SSB_ADDR:%.*]] = alloca <2 x i16>, align 4, addrspace(5)
+// CHECK-NEXT:    store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[FC]], ptr addrspace(5) [[FC_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[V2SSA]], ptr addrspace(5) [[V2SSA_ADDR]], align 4
+// CHECK-NEXT:    store <2 x i16> [[V2SSB]], ptr addrspace(5) [[V2SSB_ADDR]], align 4
+// CHECK-NEXT:    [[TMP0:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSA_ADDR]], align 4
+// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <2 x i16> [[TMP0]] to <2 x bfloat>
+// CHECK-NEXT:    [[TMP2:%.*]] = load <2 x i16>, ptr addrspace(5) [[V2SSB_ADDR]], align 4
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <2 x i16> [[TMP2]] to <2 x bfloat>
+// CHECK-NEXT:    [[TMP4:%.*]] = load float, ptr addrspace(5) [[FC_ADDR]], align 4
+// CHECK-NEXT:    [[TMP5:%.*]] = call float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> [[TMP1]], <2 x bfloat> [[TMP3]], float [[TMP4]], i1 false)
+// CHECK-NEXT:    [[TMP6:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8
+// CHECK-NEXT:    store float [[TMP5]], ptr addrspace(1) [[TMP6]], align 4
+// CHECK-NEXT:    ret void
+//
+void builtins_amdgcn_dl_insts(global float *out, float fC, short2 v2ssA, short2 v2ssB) {
+  *out = __builtin_amdgcn_fdot2_f32_bf16(v2ssA, v2ssB, fC, false);
+}
diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.td b/llvm/lib/Target/AMDGPU/AMDGPU.td
index 434a6ea4f1b41b..90340f05dab24a 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPU.td
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.td
@@ -732,7 +732,7 @@ def FeatureDot8Insts : SubtargetFeature<"dot8-insts",
 def FeatureDot9Insts : SubtargetFeature<"dot9-insts",
   "HasDot9Insts",
   "true",
-  "Has v_dot2_f16_f16, v_dot2_bf16_bf16, v_dot2_f32_bf16 instructions"
+  "Has v_dot2_f16_f16, v_dot2_bf16_bf16 instructions"
 >;
 
 def FeatureDot10Insts : SubtargetFeature<"dot10-insts",
@@ -747,6 +747,12 @@ def FeatureDot11Insts : SubtargetFeature<"dot11-insts",
   "Has v_dot4_f32_fp8_fp8, v_dot4_f32_fp8_bf8, v_dot4_f32_bf8_fp8, v_dot4_f32_bf8_bf8 instructions"
 >;
 
+def FeatureDot12Insts : SubtargetFeature<"dot12-insts",
+  "HasDot12Insts",
+  "true",
+  "Has v_dot2_f32_bf16 instructions"
+>;
+
 def FeatureMAIInsts : SubtargetFeature<"mai-insts",
   "HasMAIInsts",
   "true",
@@ -1579,6 +1585,7 @@ def FeatureISAVersion9_5_Common : FeatureSet<
    FeatureBF8ConversionScaleInsts,
    FeatureFP4ConversionScaleInsts,
    FeatureFP6BF6ConversionScaleInsts,
+   FeatureDot12Insts,
    FeatureFP8Insts
    ])>;
 
@@ -1707,6 +1714,7 @@ def FeatureISAVersion11_Common : FeatureSet<
    FeatureDot8Insts,
    FeatureDot9Insts,
    FeatureDot10Insts,
+   FeatureDot12Insts,
    FeatureNSAEncoding,
    FeaturePartialNSAEncoding,
    FeatureShaderCyclesRegister,
@@ -1790,6 +1798,7 @@ def FeatureISAVersion12 : FeatureSet<
    FeatureDot9Insts,
    FeatureDot10Insts,
    FeatureDot11Insts,
+   FeatureDot12Insts,
    FeatureNSAEncoding,
    FeaturePartialNSAEncoding,
    FeatureShaderCyclesHiLoRegisters,
@@ -2362,6 +2371,9 @@ def HasDot10Insts : Predicate<"Subtarget->hasDot10Insts()">,
 def HasDot11Insts : Predicate<"Subtarget->hasDot11Insts()">,
   AssemblerPredicate<(all_of FeatureDot11Insts)>;
 
+def HasDot12Insts : Predicate<"Subtarget->hasDot12Insts()">,
+  AssemblerPredicate<(all_of FeatureDot12Insts)>;
+
 def HasGetWaveIdInst : Predicate<"Subtarget->hasGetWaveIdInst()">,
   AssemblerPredicate<(all_of FeatureGetWaveIdInst)>;
 
diff --git a/llvm/lib/Target/AMDGPU/GCNSubtarget.h b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
index 390849dd2e0564..7e994b84426bf0 100644
--- a/llvm/lib/Target/AMDGPU/GCNSubtarget.h
+++ b/llvm/lib/Target/AMDGPU/GCNSubtarget.h
@@ -156,6 +156,7 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
   bool HasDot9Insts = false;
   bool HasDot10Insts = false;
   bool HasDot11Insts = false;
+  bool HasDot12Insts = false;
   bool HasMAIInsts = false;
   bool HasFP8Insts = false;
   bool HasFP8ConversionInsts = false;
@@ -825,6 +826,10 @@ class GCNSubtarget final : public AMDGPUGenSubtargetInfo,
     return HasDot11Insts;
   }
 
+  bool hasDot12Insts() const {
+    return HasDot12Insts;
+  }
+
   bool hasMAIInsts() const {
     return HasMAIInsts;
   }
diff --git a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
index 5d8dc5ccd18e55..ee68eb32d9173a 100644
--- a/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP3PInstructions.td
@@ -406,12 +406,12 @@ def DOT2_BF16_Profile
   let HasSrc1Mods = 1;
 }
 
-let SubtargetPredicate = HasDot9Insts  in {
+let SubtargetPredicate = HasDot12Insts  in {
 
 defm V_DOT2_F32_BF16 : VOP3PInst<"v_dot2_f32_bf16", DOT2_BF16_Profile,
   int_amdgcn_fdot2_f32_bf16, 1>;
 
-} // End SubtargetPredicate = HasDot9Insts
+} // End SubtargetPredicate = HasDot12Insts
 
 } // End let IsDOT = 1
 
@@ -2118,6 +2118,7 @@ defm V_MFMA_F32_16X16X128_F8F6F4 : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2d, "v_mf
 defm V_MFMA_SCALE_F32_16X16X128_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2d>;
 defm V_MFMA_F32_32X32X64_F8F6F4  : VOP3P_Real_MFMA_F8F6F4_gfx950_mc <0x2e, "v_mfma_f32_32x32x64_f8f6f4">;
 defm V_MFMA_SCALE_F32_32X32X64_F8F6F4 : VOP3PX_Real_ScaledMFMA_F8F6F4_mc <0x2e>;
+defm V_DOT2_F32_BF16 : VOP3P_Real_vi<0x1a>;
 
 defm V_MFMA_I32_32X32X16I8       : VOP3P_Real_MFMA_gfx940 <0x56, "v_mfma_i32_32x32x16_i8">;
 defm V_MFMA_I32_16X16X32I8       : VOP3P_Real_MFMA_gfx940 <0x57, "v_mfma_i32_16x16x32_i8">;
diff --git a/llvm/lib/TargetParser/TargetParser.cpp b/llvm/lib/TargetParser/TargetParser.cpp
index 23532b9214a892..110b94a57b802c 100644
--- a/llvm/lib/TargetParser/TargetParser.cpp
+++ b/llvm/lib/TargetParser/TargetParser.cpp
@@ -374,6 +374,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       Features["dot9-insts"] = true;
       Features["dot10-insts"] = true;
       Features["dot11-insts"] = true;
+      Features["dot12-insts"] = true;
       Features["dl-insts"] = true;
       Features["atomic-ds-pk-add-16-insts"] = true;
       Features["atomic-flat-pk-add-16-insts"] = true;
@@ -406,6 +407,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       Features["dot8-insts"] = true;
       Features["dot9-insts"] = true;
       Features["dot10-insts"] = true;
+      Features["dot12-insts"] = true;
       Features["dl-insts"] = true;
       Features["16-bit-insts"] = true;
       Features["dpp"] = true;
@@ -475,6 +477,7 @@ void AMDGPU::fillAMDGPUFeatureMap(StringRef GPU, const Triple &T,
       Features["permlane16-swap"] = true;
       Features["permlane32-swap"] = true;
       Features["ashr-pk-insts"] = true;
+      Features["dot12-insts"] = true;
       Features["gfx950-insts"] = true;
       [[fallthrough]];
     case GK_GFX942:
diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
index b0ef568fbdce31..fca418e3f82002 100644
--- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
 ; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX11
+; RUN: llc -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950
+; RUN: llc -global-isel -mtriple=amdgcn -mcpu=gfx950 -verify-machineinstrs < %s | FileCheck %s --check-prefixes=GFX950-ISEL
 
 declare float @llvm.amdgcn.fdot2.f32.bf16(<2 x bfloat> %a, <2 x bfloat> %b, float %c, i1 %clamp)
 
@@ -18,6 +20,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_clamp(
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0 clamp
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
+; GFX950:       ; %bb.0: ; %entry
+; GFX950-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX950-NEXT:    s_load_dword s1, s[14:15], 0x0
+; GFX950-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v1, s0
+; GFX950-NEXT:    v_mov_b32_e32 v2, s1
+; GFX950-NEXT:    v_dot2_f32_bf16 v1, s2, v1, v2 clamp
+; GFX950-NEXT:    s_nop 2
+; GFX950-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX950-NEXT:    s_endpgm
+;
+; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp:
+; GFX950-ISEL:       ; %bb.0: ; %entry
+; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
+; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX950-ISEL-NEXT:    v_dot2_f32_bf16 v0, s2, v0, v1 clamp
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX950-ISEL-NEXT:    s_nop 1
+; GFX950-ISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX950-ISEL-NEXT:    s_endpgm
+
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
@@ -46,6 +81,39 @@ define amdgpu_kernel void @test_llvm_amdgcn_fdot2_f32_bf16_no_clamp(
 ; GFX11-NEXT:    v_dot2_f32_bf16 v0, s2, s3, v0
 ; GFX11-NEXT:    global_store_b32 v1, v0, s[0:1]
 ; GFX11-NEXT:    s_endpgm
+;
+; GFX950-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
+; GFX950:       ; %bb.0: ; %entry
+; GFX950-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-NEXT:    v_mov_b32_e32 v0, 0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX950-NEXT:    s_load_dword s1, s[14:15], 0x0
+; GFX950-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX950-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-NEXT:    v_mov_b32_e32 v1, s0
+; GFX950-NEXT:    v_mov_b32_e32 v2, s1
+; GFX950-NEXT:    v_dot2_f32_bf16 v1, s2, v1, v2
+; GFX950-NEXT:    s_nop 2
+; GFX950-NEXT:    global_store_dword v0, v1, s[8:9]
+; GFX950-NEXT:    s_endpgm
+;
+; GFX950-ISEL-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp:
+; GFX950-ISEL:       ; %bb.0: ; %entry
+; GFX950-ISEL-NEXT:    s_load_dwordx8 s[8:15], s[4:5], 0x24
+; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT:    s_load_dword s0, s[12:13], 0x0
+; GFX950-ISEL-NEXT:    s_load_dword s1, s[14:15], 0x0
+; GFX950-ISEL-NEXT:    s_load_dword s2, s[10:11], 0x0
+; GFX950-ISEL-NEXT:    s_waitcnt lgkmcnt(0)
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v0, s0
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, s1
+; GFX950-ISEL-NEXT:    v_dot2_f32_bf16 v0, s2, v0, v1
+; GFX950-ISEL-NEXT:    v_mov_b32_e32 v1, 0
+; GFX950-ISEL-NEXT:    s_nop 1
+; GFX950-ISEL-NEXT:    global_store_dword v1, v0, s[8:9]
+; GFX950-ISEL-NEXT:    s_endpgm
+
     ptr addrspace(1) %r,
     ptr addrspace(1) %a,
     ptr addrspace(1) %b,
diff --git a/llvm/test/MC/AMDGPU/gfx950_dlops.s b/llvm/test/MC/AMDGPU/gfx950_dlops.s
new file mode 100644
index 00000000000000..4ae60ac785f496
--- /dev/null
+++ b/llvm/test/MC/AMDGPU/gfx950_dlops.s
@@ -0,0 +1,61 @@
+// RUN: llvm-mc -triple=amdgcn -mcpu=gfx950 -show-encoding %s | FileCheck --check-prefix=GFX950 %s
+
+v_dot2_f32_bf16 v5, v1, v2, v3
+// GFX950: v_dot2_f32_bf16 v5, v1, v2, v3          ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c]
+
+v_dot2_f32_bf16 v5, v1, v2, s3
+// GFX950: v_dot2_f32_bf16 v5, v1, v2, s3          ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18]
+
+v_dot2_f32_bf16 v2, v1, 0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0, v2           ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 0.5, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -0.5, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 1.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -1.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 2.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -2.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 4.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, -4.0, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, v1, 0.15915494, v2
+// GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2  ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c]
+
+v_dot2_f32_bf16 v2, 0.5, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -0.5, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 1.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -1.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 2.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -2.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, 4.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c]
+
+v_dot2_f32_bf16 v2, -4.0, v1, v2
+// GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c]
diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
index ccc55352413777..ca8b1750a579e2 100644
--- a/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
+++ b/llvm/test/MC/Disassembler/AMDGPU/gfx950_dasm_vop3.txt
@@ -780,6 +780,66 @@
 # GFX950: v_ashr_pk_u8_i32 v1, v2, v3, v4 op_sel:[0,0,0,1] ; encoding: [0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04]
 0x01,0x40,0x66,0xd2,0x02,0x07,0x12,0x04
 
+# GFX950: v_dot2_f32_bf16 v5, v1, v2, v3          ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c]
+0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x1c
+
+# GFX950: v_dot2_f32_bf16 v5, v1, v2, s3          ; encoding: [0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18]
+0x05,0x40,0x9a,0xd3,0x01,0x05,0x0e,0x18
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0, v2           ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0x01,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0.5, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe1,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -0.5, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe3,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 1.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe5,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -1.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe7,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 2.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xe9,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -2.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xeb,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 4.0, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xed,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, -4.0, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xef,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, v1, 0.15915494, v2  ; encoding: [0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c]
+0x02,0x40,0x9a,0xd3,0x01,0xf1,0x09,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 0.5, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf0,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -0.5, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf1,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 1.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf2,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -1.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf3,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 2.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf4,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -2.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf5,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, 4.0, v1, v2         ; encoding: [0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf6,0x02,0x0a,0x1c
+
+# GFX950: v_dot2_f32_bf16 v2, -4.0, v1, v2        ; encoding: [0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c]
+0x02,0x40,0x9a,0xd3,0xf7,0x02,0x0a,0x1c
+
 # GFX950: v_cvt_scalef32_2xpk16_fp6_f32 v[20:25], v[10:25], v[10:25], v6 ; encoding: [0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04]
 0x14,0x00,0x52,0xd2,0x0a,0x15,0x1a,0x04
 



More information about the llvm-branch-commits mailing list