[llvm] 1b13849 - [AArch64] Add bf16 broadcast and transpose costs

Fri May 9 14:39:02 PDT 2025

Author: David Green
Date: 2025-05-09T22:38:57+01:00
New Revision: 1b13849a9b3550d8501a9ee5cc7f2548bc319d0b

URL: https://github.com/llvm/llvm-project/commit/1b13849a9b3550d8501a9ee5cc7f2548bc319d0b
DIFF: https://github.com/llvm/llvm-project/commit/1b13849a9b3550d8501a9ee5cc7f2548bc319d0b.diff

LOG: [AArch64] Add bf16 broadcast and transpose costs

The broadcast and transpose shuffle costs depend only on the size of the
element, not on its type (although the codegen does need to account for the
element type).

Added: 
    

Modified: 
    llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
    llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
    llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index 0e0b30b5b1e9c..97e4993d52b4f 100644

--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -5607,6 +5607,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
         {TTI::SK_Broadcast, MVT::v2i64, 1},
         {TTI::SK_Broadcast, MVT::v4f16, 1},
         {TTI::SK_Broadcast, MVT::v8f16, 1},
+        {TTI::SK_Broadcast, MVT::v4bf16, 1},
+        {TTI::SK_Broadcast, MVT::v8bf16, 1},
         {TTI::SK_Broadcast, MVT::v2f32, 1},
         {TTI::SK_Broadcast, MVT::v4f32, 1},
         {TTI::SK_Broadcast, MVT::v2f64, 1},
@@ -5621,6 +5623,8 @@ InstructionCost AArch64TTIImpl::getShuffleCost(
         {TTI::SK_Transpose, MVT::v2i64, 1},
         {TTI::SK_Transpose, MVT::v4f16, 1},
         {TTI::SK_Transpose, MVT::v8f16, 1},
+        {TTI::SK_Transpose, MVT::v4bf16, 1},
+        {TTI::SK_Transpose, MVT::v8bf16, 1},
         {TTI::SK_Transpose, MVT::v2f32, 1},
         {TTI::SK_Transpose, MVT::v4f32, 1},
         {TTI::SK_Transpose, MVT::v2f64, 1},

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
index 0730f7cbbe6ad..6175ea48c5631 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-broadcast.ll
@@ -31,10 +31,10 @@ define void @broadcast() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8f16 = shufflevector <8 x half> undef, <8 x half> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %v1bf16 = shufflevector <1 x bfloat> undef, <1 x bfloat> undef, <1 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <2 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <16 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <1 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <4 x i32> zeroinitializer
@@ -116,8 +116,8 @@ define void @broadcast_half() {
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <1 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <2 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <4 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <8 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 0 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <1 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v8f32 = shufflevector <8 x float> undef, <8 x float> undef, <4 x i32> zeroinitializer
@@ -194,9 +194,9 @@ define void @broadcast_double() {
 ; CHECK-NEXT:  Cost Model: Found costs of 4 for: %v16f16 = shufflevector <16 x half> undef, <16 x half> undef, <32 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1bf16 = shufflevector <1 x bfloat> undef, <1 x bfloat> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v2bf16 = shufflevector <2 x bfloat> undef, <2 x bfloat> undef, <4 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:14 CodeSize:7 Lat:14 SizeLat:14 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <8 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:28 CodeSize:14 Lat:28 SizeLat:28 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer
-; CHECK-NEXT:  Cost Model: Found costs of RThru:56 CodeSize:28 Lat:56 SizeLat:56 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v4bf16 = shufflevector <4 x bfloat> undef, <4 x bfloat> undef, <8 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v8bf16 = shufflevector <8 x bfloat> undef, <8 x bfloat> undef, <16 x i32> zeroinitializer
+; CHECK-NEXT:  Cost Model: Found costs of 4 for: %v16bf16 = shufflevector <16 x bfloat> undef, <16 x bfloat> undef, <32 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of RThru:2 CodeSize:1 Lat:2 SizeLat:2 for: %v1f32 = shufflevector <1 x float> undef, <1 x float> undef, <2 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 1 for: %v2f32 = shufflevector <2 x float> undef, <2 x float> undef, <4 x i32> zeroinitializer
 ; CHECK-NEXT:  Cost Model: Found costs of 2 for: %v4f32 = shufflevector <4 x float> undef, <4 x float> undef, <8 x i32> zeroinitializer

diff  --git a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
index 35cfa888d5d76..4c4843088551a 100644
--- a/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/shuffle-transpose.ll
@@ -218,3 +218,39 @@ define <8 x half> @trn2.v8f16(<8 x half> %v0, <8 x half> %v1) {
   %tmp0 = shufflevector <8 x half> %v0, <8 x half> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
   ret <8 x half> %tmp0
 }
+
+define <4 x bfloat> @trn1.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v4bf16'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+  ret <4 x bfloat> %tmp0
+}
+
+define <4 x bfloat> @trn2.v4bf16(<4 x bfloat> %v0, <4 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v4bf16'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <4 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <4 x bfloat> %v0, <4 x bfloat> %v1, <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+  ret <4 x bfloat> %tmp0
+}
+
+define <8 x bfloat> @trn1.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn1.v8bf16'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
+  ret <8 x bfloat> %tmp0
+}
+
+define <8 x bfloat> @trn2.v8bf16(<8 x bfloat> %v0, <8 x bfloat> %v1) {
+; CHECK-LABEL: 'trn2.v8bf16'
+; CHECK-NEXT:  Cost Model: Found costs of 1 for: %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+; CHECK-NEXT:  Cost Model: Found costs of RThru:0 CodeSize:1 Lat:1 SizeLat:1 for: ret <8 x bfloat> %tmp0
+;
+  %tmp0 = shufflevector <8 x bfloat> %v0, <8 x bfloat> %v1, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
+  ret <8 x bfloat> %tmp0
+}