[llvm] [AMDGPU] Fix vector legalization for bf16 valu ops (PR #158439)
Giuseppe Rossini via llvm-commits
llvm-commits at lists.llvm.org
Sat Sep 13 13:59:23 PDT 2025
https://github.com/giuseros created https://github.com/llvm/llvm-project/pull/158439
Add v4, v8, v16, and v32 bf16 vector legalizations for the following operations:
- `FADD`
- `FMUL`
- `FMA`
- `FCANONICALIZE`
From 783302d49db678f446ce33806e6e6f1acc078e63 Mon Sep 17 00:00:00 2001
From: Giuseppe Rossini <giuseppe.rossini at amd.com>
Date: Sat, 13 Sep 2025 00:49:08 +0100
Subject: [PATCH] [AMDGPU] Fix vector legalization for bf16 valu ops
Add v4,v8,v16,v32 legalizations for the following operations:
- FADD
- FMUL
- FMA
- FCANONICALIZE
---
.../AMDGPU/AMDGPUTargetTransformInfo.cpp | 6 +-
llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 7 ++
.../Analysis/CostModel/AMDGPU/canonicalize.ll | 68 +++++++++++-
llvm/test/CodeGen/AMDGPU/bf16.ll | 104 ++++++++++++++++++
4 files changed, 178 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
index 3e2b2c3510569..b07e936c494f6 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetTransformInfo.cpp
@@ -607,6 +607,8 @@ InstructionCost GCNTTIImpl::getArithmeticInstrCost(
case ISD::FSUB:
if (ST->hasPackedFP32Ops() && SLT == MVT::f32)
NElts = (NElts + 1) / 2;
+ if (ST->hasBF16PackedInsts() && SLT == MVT::bf16)
+ NElts = (NElts + 1) / 2;
if (SLT == MVT::f64)
return LT.first * NElts * get64BitInstrCost(CostKind);
@@ -746,7 +748,9 @@ GCNTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
MVT::SimpleValueType SLT = LT.second.getScalarType().SimpleTy;
- if ((ST->hasVOP3PInsts() && (SLT == MVT::f16 || SLT == MVT::i16)) ||
+ if ((ST->hasVOP3PInsts() &&
+ (SLT == MVT::f16 || SLT == MVT::i16 ||
+ (SLT == MVT::bf16 && ST->hasBF16PackedInsts()))) ||
(ST->hasPackedFP32Ops() && SLT == MVT::f32))
NElts = (NElts + 1) / 2;
diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
index cb3e544449bbf..761e235f39df5 100644
--- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
+++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp
@@ -850,6 +850,13 @@ SITargetLowering::SITargetLowering(const TargetMachine &TM,
setOperationAction(ISD::SELECT, {MVT::v4i16, MVT::v4f16, MVT::v4bf16},
Custom);
+ if (Subtarget->hasBF16PackedInsts()) {
+ for (MVT VT : {MVT::v4bf16, MVT::v8bf16, MVT::v16bf16, MVT::v32bf16})
+ // Split vector operations.
+ setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FCANONICALIZE},
+ VT, Custom);
+ }
+
if (Subtarget->hasPackedFP32Ops()) {
setOperationAction({ISD::FADD, ISD::FMUL, ISD::FMA, ISD::FNEG},
MVT::v2f32, Legal);
diff --git a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
index 7ac4db3119210..904db9064a369 100644
--- a/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
+++ b/llvm/test/Analysis/CostModel/AMDGPU/canonicalize.ll
@@ -3,11 +3,13 @@
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX8 %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX9 %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=ALL,GFX10 %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=throughput 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250 %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,BASE-SIZE %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx803 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX8-SIZE %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx900 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX9-SIZE %s
; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1030 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=ALL-SIZE,GFX10-SIZE %s
+; RUN: opt < %s -mtriple=amdgcn-unknown-amdhsa -mcpu=gfx1250 -passes="print<cost-model>" -cost-kind=code-size 2>&1 -disable-output | FileCheck -check-prefixes=GFX1250-SIZE %s
define void @canonicalize_f16() {
; BASE-LABEL: 'canonicalize_f16'
@@ -141,6 +143,16 @@ define void @canonicalize_bf16() {
; GFX10-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
; GFX10-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
+; GFX1250-LABEL: 'canonicalize_bf16'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
; BASE-SIZE-LABEL: 'canonicalize_bf16'
; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
; BASE-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
@@ -181,6 +193,15 @@ define void @canonicalize_bf16() {
; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
; GFX10-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; GFX1250-SIZE-LABEL: 'canonicalize_bf16'
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4bf16 = call <4 x bfloat> @llvm.canonicalize.v4bf16(<4 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v5bf16 = call <5 x bfloat> @llvm.canonicalize.v5bf16(<5 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v16bf16 = call <16 x bfloat> @llvm.canonicalize.v16bf16(<16 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v17bf16 = call <17 x bfloat> @llvm.canonicalize.v17bf16(<17 x bfloat> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
%bf16 = call bfloat @llvm.canonicalize.bf16(bfloat undef) #1
%v2bf16 = call <2 x bfloat> @llvm.canonicalize.v2bf16(<2 x bfloat> undef) #1
%v3bf16 = call <3 x bfloat> @llvm.canonicalize.v3bf16(<3 x bfloat> undef) #1
@@ -203,6 +224,17 @@ define void @canonicalize_f32() {
; ALL-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
+; GFX1250-LABEL: 'canonicalize_f32'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
; ALL-SIZE-LABEL: 'canonicalize_f32'
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
@@ -214,6 +246,16 @@ define void @canonicalize_f32() {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
+; GFX1250-SIZE-LABEL: 'canonicalize_f32':
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %f32 = call float @llvm.canonicalize.f32(float undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %v4f32 = call <4 x float> @llvm.canonicalize.v4f32(<4 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %v5f32 = call <5 x float> @llvm.canonicalize.v5f32(<5 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v8f32 = call <8 x float> @llvm.canonicalize.v8f32(<8 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %v9f32 = call <9 x float> @llvm.canonicalize.v9f32(<9 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %v16f32 = call <16 x float> @llvm.canonicalize.v16f32(<16 x float> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
%f32 = call float @llvm.canonicalize.f32(float undef) #1
%v2f32 = call <2 x float> @llvm.canonicalize.v2f32(<2 x float> undef) #1
%v3f32 = call <3 x float> @llvm.canonicalize.v3f32(<3 x float> undef) #1
@@ -236,6 +278,16 @@ define void @canonicalize_f64() {
; ALL-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
; ALL-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
;
+; GFX1250-LABEL: 'canonicalize_f64'
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 96 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 320 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-NEXT: Cost Model: Found an estimated cost of 10 for instruction: ret void
+;
; ALL-SIZE-LABEL: 'canonicalize_f64'
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
@@ -245,6 +297,16 @@ define void @canonicalize_f64() {
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
; ALL-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
+;
+; GFX1250-SIZE-LABEL: 'canonicalize_f64'
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %f64 = call double @llvm.canonicalize.f64(double undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %v3f64 = call <3 x double> @llvm.canonicalize.v3f64(<3 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %v4f64 = call <4 x double> @llvm.canonicalize.v4f64(<4 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v5f64 = call <5 x double> @llvm.canonicalize.v5f64(<5 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 48 for instruction: %v8f64 = call <8 x double> @llvm.canonicalize.v8f64(<8 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 160 for instruction: %v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef)
+; GFX1250-SIZE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: ret void
;
%f64 = call double @llvm.canonicalize.f64(double undef) #1
%v2f64 = call <2 x double> @llvm.canonicalize.v2f64(<2 x double> undef) #1
@@ -255,9 +317,3 @@ define void @canonicalize_f64() {
%v16f64 = call <16 x double> @llvm.canonicalize.v16f64(<16 x double> undef) #1
ret void
}
-
-
-
-
-
-
diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll
index 44c719f3635c8..84a4440e03977 100644
--- a/llvm/test/CodeGen/AMDGPU/bf16.ll
+++ b/llvm/test/CodeGen/AMDGPU/bf16.ll
@@ -7,6 +7,7 @@
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1010 | FileCheck %s -check-prefixes=GFX10
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11TRUE16
; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 | FileCheck %s -check-prefixes=GFX11,GFX11FAKE16
+; RUN: llc < %s -mtriple=amdgcn -mcpu=gfx1250 | FileCheck %s -check-prefixes=GFX1250
define void @test_load_store(ptr addrspace(1) %in, ptr addrspace(1) %out) {
; GCN-LABEL: test_load_store:
@@ -9859,6 +9860,12 @@ define <2 x bfloat> @v_fadd_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
@@ -10383,6 +10390,13 @@ define <4 x bfloat> @v_fadd_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v2
+; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
@@ -10921,6 +10935,15 @@ define <8 x bfloat> @v_fadd_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v4
+; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v5
+; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v6
+; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v7
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fadd <8 x bfloat> %a, %b
ret <8 x bfloat> %op
}
@@ -14147,6 +14170,29 @@ define bfloat @v_fadd_bf16_fpimm_0(bfloat %arg0) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_lshrrev_b32_e32 v0, 16, v0
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fadd_v32bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_b32 v31, off, s32
+; GFX1250-NEXT: v_pk_add_bf16 v0, v0, v16
+; GFX1250-NEXT: v_pk_add_bf16 v1, v1, v17
+; GFX1250-NEXT: v_pk_add_bf16 v2, v2, v18
+; GFX1250-NEXT: v_pk_add_bf16 v3, v3, v19
+; GFX1250-NEXT: v_pk_add_bf16 v4, v4, v20
+; GFX1250-NEXT: v_pk_add_bf16 v5, v5, v21
+; GFX1250-NEXT: v_pk_add_bf16 v6, v6, v22
+; GFX1250-NEXT: v_pk_add_bf16 v7, v7, v23
+; GFX1250-NEXT: v_pk_add_bf16 v8, v8, v24
+; GFX1250-NEXT: v_pk_add_bf16 v9, v9, v25
+; GFX1250-NEXT: v_pk_add_bf16 v10, v10, v26
+; GFX1250-NEXT: v_pk_add_bf16 v11, v11, v27
+; GFX1250-NEXT: v_pk_add_bf16 v12, v12, v28
+; GFX1250-NEXT: v_pk_add_bf16 v13, v13, v29
+; GFX1250-NEXT: v_pk_add_bf16 v14, v14, v30
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_add_bf16 v15, v15, v31
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%add = fadd bfloat %arg0, 1.0
ret bfloat %add
}
@@ -15351,6 +15397,12 @@ define <2 x bfloat> @v_fmul_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v0, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v1
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fmul <2 x bfloat> %a, %b
ret <2 x bfloat> %op
}
@@ -15875,6 +15927,13 @@ define <4 x bfloat> @v_fmul_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v1, v4, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v2
+; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v3
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fmul <4 x bfloat> %a, %b
ret <4 x bfloat> %op
}
@@ -16413,6 +16472,15 @@ define <8 x bfloat> @v_fmul_v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v3, v3, v8, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v8bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v4
+; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v5
+; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v6
+; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v7
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fmul <8 x bfloat> %a, %b
ret <8 x bfloat> %op
}
@@ -19535,6 +19603,29 @@ define <32 x bfloat> @v_fmul_v32bf16(<32 x bfloat> %a, <32 x bfloat> %b) {
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v15, v15, v17, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fmul_v32bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: scratch_load_b32 v31, off, s32
+; GFX1250-NEXT: v_pk_mul_bf16 v0, v0, v16
+; GFX1250-NEXT: v_pk_mul_bf16 v1, v1, v17
+; GFX1250-NEXT: v_pk_mul_bf16 v2, v2, v18
+; GFX1250-NEXT: v_pk_mul_bf16 v3, v3, v19
+; GFX1250-NEXT: v_pk_mul_bf16 v4, v4, v20
+; GFX1250-NEXT: v_pk_mul_bf16 v5, v5, v21
+; GFX1250-NEXT: v_pk_mul_bf16 v6, v6, v22
+; GFX1250-NEXT: v_pk_mul_bf16 v7, v7, v23
+; GFX1250-NEXT: v_pk_mul_bf16 v8, v8, v24
+; GFX1250-NEXT: v_pk_mul_bf16 v9, v9, v25
+; GFX1250-NEXT: v_pk_mul_bf16 v10, v10, v26
+; GFX1250-NEXT: v_pk_mul_bf16 v11, v11, v27
+; GFX1250-NEXT: v_pk_mul_bf16 v12, v12, v28
+; GFX1250-NEXT: v_pk_mul_bf16 v13, v13, v29
+; GFX1250-NEXT: v_pk_mul_bf16 v14, v14, v30
+; GFX1250-NEXT: s_wait_loadcnt 0x0
+; GFX1250-NEXT: v_pk_mul_bf16 v15, v15, v31
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = fmul <32 x bfloat> %a, %b
ret <32 x bfloat> %op
}
@@ -46178,6 +46269,12 @@ define <2 x bfloat> @v_fma_v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat>
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1)
; GFX11FAKE16-NEXT: v_perm_b32 v0, v1, v0, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fma_v2bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v1, v2
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> %a, <2 x bfloat> %b, <2 x bfloat> %c)
ret <2 x bfloat> %op
}
@@ -46780,6 +46877,13 @@ define <4 x bfloat> @v_fma_v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat>
; GFX11FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2)
; GFX11FAKE16-NEXT: v_perm_b32 v1, v4, v1, 0x7060302
; GFX11FAKE16-NEXT: s_setpc_b64 s[30:31]
+; GFX1250-LABEL: v_fma_v4bf16:
+; GFX1250: ; %bb.0:
+; GFX1250-NEXT: s_wait_loadcnt_dscnt 0x0
+; GFX1250-NEXT: s_wait_kmcnt 0x0
+; GFX1250-NEXT: v_pk_fma_bf16 v0, v0, v2, v4
+; GFX1250-NEXT: v_pk_fma_bf16 v1, v1, v3, v5
+; GFX1250-NEXT: s_set_pc_i64 s[30:31]
%op = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> %a, <4 x bfloat> %b, <4 x bfloat> %c)
ret <4 x bfloat> %op
}
More information about the llvm-commits mailing list