[llvm] [AMDGPU] Fix vector legalization for bf16 valu ops (PR #158439)

Sat Sep 13 13:59:23 PDT 2025

https://github.com/giuseros created https://github.com/llvm/llvm-project/pull/158439

Add v4,v8,v16,v32 legalizations for the following operations:
- `FADD`
- `FMUL`
- `FMA`
- `FCANONICALIZE`

>From 783302d49db678f446ce33806e6e6f1acc078e63 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini at amd.com>
Date: Sat, 13 Sep 2025 00:49:08 +0100
Subject: [PATCH] [AMDGPU] Fix vector legalization for bf16 valu ops

Add v4,v8,v16,v32 legalizations for the following operations:
- FADD
- FMUL
- FMA
- FCANONICALIZE
---
 .../AMDGPU/AMDGPUTargetTransformInfo.cpp      |   6 +-
 llvm/lib/Target/AMDGPU/SIISelLowering.cpp     |   7 ++
 .../Analysis/CostModel/AMDGPU/canonicalize.ll |  68 +++++++++++-
 llvm/test/CodeGen/AMDGPU/bf16.ll              | 104 ++++++++++++++++++
 4 files changed, 178 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3e2b2c3510569..b07e936c494f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -607,6 +607,8 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
   case ISD::FSUB:
     if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
       NElts = (NElts + 1) / 2;
+    if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
+      NElts = (NElts + 1) / 2;
     if (SLT == MVT::f64)
       return LT.first * NElts * get64BitInstrCost(CostKind);
 
@@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
   MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
 
-  if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
+  if ((ST->hasVOP3PInsts() &&
+       (SLT == MVT::f16 || SLT == MVT::i16 ||
+        (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
       (ST->hasPackedFP32Ops() && SLT == MVT::f32))
     NElts = (NElts + 1) / 2;
 
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cb3e544449bbf..761e235f39df5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -850,6 +850,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
     setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
                        Custom);
 
+    if (Subtarget->hasBF16PackedInsts()) {
+      for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
+        // Split vector operations.
+        setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
+                           VT, Custom);
+    }
+
     if (Subtarget->hasPackedFP32Ops()) {
       setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
                          MVT::v2f32, Legal);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
index 7ac4db3119210..904db9064a369 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -3,11 +3,13 @@
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s
 
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
 ; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s
 
 define void @canonicalize_f16() {
 ; BASE-LABEL: 'canonicalize_f16'
@@ -141,6 +143,16 @@ define void @canonicalize_bf16() {
 ; GFX10-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
 ; GFX10-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_bf16'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; BASE-SIZE-LABEL: 'canonicalize_bf16'
 ; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
 ; BASE-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
@@ -181,6 +193,15 @@ define void @canonicalize_bf16() {
 ; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
 ; GFX10-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-SIZE-LABEL: 'canonicalize_bf16'
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction:   %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
   %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
   %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
   %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
@@ -203,6 +224,17 @@ define void @canonicalize_f32() {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_f32'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction:   %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction:   %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; ALL-SIZE-LABEL: 'canonicalize_f32'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
@@ -214,6 +246,16 @@ define void @canonicalize_f32() {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
+; GFX1250-SIZE-LABEL: 'canonicalize_f32':
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction:   %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction:   %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction:   %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction:   %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction:   ret void
   %f32 = call float @llvm.canonicalize.f32(float undef) #1
   %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
   %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
@@ -236,6 +278,16 @@ define void @canonicalize_f64() {
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
 ; ALL-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: ret void
 ;
+; GFX1250-LABEL: 'canonicalize_f64'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction:   %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction:   %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction:   %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction:   %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction:   %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction:   %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction:   %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction:   ret void
+;
 ; ALL-SIZE-LABEL: 'canonicalize_f64'
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
@@ -245,6 +297,16 @@ define void @canonicalize_f64() {
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
 ; ALL-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX1250-SIZE-LABEL: 'canonicalize_f64'
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret void
 ;
   %f64 = call double @llvm.canonicalize.f64(double undef) #1
   %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
@@ -255,9 +317,3 @@ define void @canonicalize_f64() {
   %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
   ret void
 }
-
-
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 44c719f3635c8..84a4440e03977 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,6 +7,7 @@
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
 ; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
 
 define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
 ; GCN-LABEL: test_load_store:
@@ -9859,6 +9860,12 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_add_bf16 v0, v0, v1
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fadd <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
 }
@@ -10383,6 +10390,13 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_add_bf16 v0, v0, v2
+; GFX1250-NEXT:   v_pk_add_bf16 v1, v1, v3
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fadd <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
 }
@@ -10921,6 +10935,15 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_add_bf16 v0, v0, v4
+; GFX1250-NEXT:    v_pk_add_bf16 v1, v1, v5
+; GFX1250-NEXT:    v_pk_add_bf16 v2, v2, v6
+; GFX1250-NEXT:    v_pk_add_bf16 v3, v3, v7
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = fadd <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
 }
@@ -14147,6 +14170,29 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_lshrrev_b32_e32 v0, 16, v0
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v32bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   scratch_load_b32 v31, off, s32
+; GFX1250-NEXT:   v_pk_add_bf16 v0, v0, v16
+; GFX1250-NEXT:   v_pk_add_bf16 v1, v1, v17
+; GFX1250-NEXT:   v_pk_add_bf16 v2, v2, v18
+; GFX1250-NEXT:   v_pk_add_bf16 v3, v3, v19
+; GFX1250-NEXT:   v_pk_add_bf16 v4, v4, v20
+; GFX1250-NEXT:   v_pk_add_bf16 v5, v5, v21
+; GFX1250-NEXT:   v_pk_add_bf16 v6, v6, v22
+; GFX1250-NEXT:   v_pk_add_bf16 v7, v7, v23
+; GFX1250-NEXT:   v_pk_add_bf16 v8, v8, v24
+; GFX1250-NEXT:   v_pk_add_bf16 v9, v9, v25
+; GFX1250-NEXT:   v_pk_add_bf16 v10, v10, v26
+; GFX1250-NEXT:   v_pk_add_bf16 v11, v11, v27
+; GFX1250-NEXT:   v_pk_add_bf16 v12, v12, v28
+; GFX1250-NEXT:   v_pk_add_bf16 v13, v13, v29
+; GFX1250-NEXT:   v_pk_add_bf16 v14, v14, v30
+; GFX1250-NEXT:   s_wait_loadcnt 0x0
+; GFX1250-NEXT:   v_pk_add_bf16 v15, v15, v31
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %add = fadd bfloat %arg0, 1.0
   ret bfloat %add
 }
@@ -15351,6 +15397,12 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v0, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_mul_bf16 v0, v0, v1
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = fmul <2 x bfloat> %a, %b
   ret <2 x bfloat> %op
 }
@@ -15875,6 +15927,13 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v1, v4, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_mul_bf16 v0, v0, v2
+; GFX1250-NEXT:   v_pk_mul_bf16 v1, v1, v3
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fmul <4 x bfloat> %a, %b
   ret <4 x bfloat> %op
 }
@@ -16413,6 +16472,15 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v3, v3, v8, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v8bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   v_pk_mul_bf16 v0, v0, v4
+; GFX1250-NEXT:   v_pk_mul_bf16 v1, v1, v5
+; GFX1250-NEXT:   v_pk_mul_bf16 v2, v2, v6
+; GFX1250-NEXT:   v_pk_mul_bf16 v3, v3, v7
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fmul <8 x bfloat> %a, %b
   ret <8 x bfloat> %op
 }
@@ -19535,6 +19603,29 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v15, v15, v17, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v32bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:   s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:   s_wait_kmcnt 0x0
+; GFX1250-NEXT:   scratch_load_b32 v31, off, s32
+; GFX1250-NEXT:   v_pk_mul_bf16 v0, v0, v16
+; GFX1250-NEXT:   v_pk_mul_bf16 v1, v1, v17
+; GFX1250-NEXT:   v_pk_mul_bf16 v2, v2, v18
+; GFX1250-NEXT:   v_pk_mul_bf16 v3, v3, v19
+; GFX1250-NEXT:   v_pk_mul_bf16 v4, v4, v20
+; GFX1250-NEXT:   v_pk_mul_bf16 v5, v5, v21
+; GFX1250-NEXT:   v_pk_mul_bf16 v6, v6, v22
+; GFX1250-NEXT:   v_pk_mul_bf16 v7, v7, v23
+; GFX1250-NEXT:   v_pk_mul_bf16 v8, v8, v24
+; GFX1250-NEXT:   v_pk_mul_bf16 v9, v9, v25
+; GFX1250-NEXT:   v_pk_mul_bf16 v10, v10, v26
+; GFX1250-NEXT:   v_pk_mul_bf16 v11, v11, v27
+; GFX1250-NEXT:   v_pk_mul_bf16 v12, v12, v28
+; GFX1250-NEXT:   v_pk_mul_bf16 v13, v13, v29
+; GFX1250-NEXT:   v_pk_mul_bf16 v14, v14, v30
+; GFX1250-NEXT:   s_wait_loadcnt 0x0
+; GFX1250-NEXT:   v_pk_mul_bf16 v15, v15, v31
+; GFX1250-NEXT:   s_set_pc_i64 s[30:31]
   %op = fmul <32 x bfloat> %a, %b
   ret <32 x bfloat> %op
 }
@@ -46178,6 +46269,12 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_1)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v0, v1, v0, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fma_v2bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v1, v2
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
   ret <2 x bfloat> %op
 }
@@ -46780,6 +46877,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
 ; GFX11FAKE16-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11FAKE16-NEXT:    v_perm_b32 v1, v4, v1, 0x7060302
 ; GFX11FAKE16-NEXT:    s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fma_v4bf16:
+; GFX1250:       ; %bb.0:
+; GFX1250-NEXT:    s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT:    s_wait_kmcnt 0x0
+; GFX1250-NEXT:    v_pk_fma_bf16 v0, v0, v2, v4
+; GFX1250-NEXT:    v_pk_fma_bf16 v1, v1, v3, v5
+; GFX1250-NEXT:    s_set_pc_i64 s[30:31]
   %op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
   ret <4 x bfloat> %op
 }