[llvm] [AArch64] Improve vector funnel shift by constant costs. (PR #130044)

Thu Mar 6 01:59:11 PST 2025

llvmbot wrote:




@llvm/pr-subscribers-llvm-analysis

Author: David Green (davemgreen)

<details>
<summary>Changes</summary>

We now have better codegen, and can have better costs to match. The generated code should now produce a shl+usra and can be seen in testcases such as: https://github.com/llvm/llvm-project/blob/7e5821bae80db3f3f0fe0d5f8ce62f79e548eed5/llvm/test/CodeGen/AArch64/fsh.ll#L3941.

---
Full diff: https://github.com/llvm/llvm-project/pull/130044.diff


3 Files Affected:

- (modified) llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp (+4-5) 
- (modified) llvm/test/Analysis/CostModel/AArch64/fshl.ll (+8-8) 
- (modified) llvm/test/Analysis/CostModel/AArch64/fshr.ll (+8-8) 


``````````diff

diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
index ba019e1a4ecd5..53c6a029e9287 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp
@@ -884,12 +884,11 @@ AArch64TTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA,
 
     const auto LegalisationCost = getTypeLegalizationCost(RetTy);
     if (OpInfoZ.isUniform()) {
-      // FIXME: The costs could be lower if the codegen is better.
       static const CostTblEntry FshlTbl[] = {
-          {Intrinsic::fshl, MVT::v4i32, 3}, // ushr + shl + orr
-          {Intrinsic::fshl, MVT::v2i64, 3}, {Intrinsic::fshl, MVT::v16i8, 4},
-          {Intrinsic::fshl, MVT::v8i16, 4}, {Intrinsic::fshl, MVT::v2i32, 3},
-          {Intrinsic::fshl, MVT::v8i8, 4},  {Intrinsic::fshl, MVT::v4i16, 4}};
+          {Intrinsic::fshl, MVT::v4i32, 2}, // shl + usra
+          {Intrinsic::fshl, MVT::v2i64, 2}, {Intrinsic::fshl, MVT::v16i8, 2},
+          {Intrinsic::fshl, MVT::v8i16, 2}, {Intrinsic::fshl, MVT::v2i32, 2},
+          {Intrinsic::fshl, MVT::v8i8, 2},  {Intrinsic::fshl, MVT::v4i16, 2}};
       // Costs for both fshl & fshr are the same, so just pass Intrinsic::fshl
       // to avoid having to duplicate the costs.
       const auto *Entry =
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshl.ll b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
index 8c6466ab470b4..c59b41157c304 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshl.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshl.ll
@@ -129,11 +129,11 @@ declare i19 @llvm.fshl.i19(i19, i19, i19)
 
 define <16 x i8> @fshl_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) {
 ; RECIP-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshl
 ;
 ; SIZE-LABEL: 'fshl_v16i8_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <16 x i8> @llvm.fshl.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %fshl
 ;
 entry:
@@ -173,11 +173,11 @@ declare <16 x i8> @llvm.fshl.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 
 define <8 x i16> @fshl_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) {
 ; RECIP-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshl
 ;
 ; SIZE-LABEL: 'fshl_v8i16_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <8 x i16> @llvm.fshl.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %fshl
 ;
 entry:
@@ -217,11 +217,11 @@ declare <8 x i16> @llvm.fshl.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
 
 define <4 x i32> @fshl_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) {
 ; RECIP-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshl
 ;
 ; SIZE-LABEL: 'fshl_v4i32_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <4 x i32> @llvm.fshl.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %fshl
 ;
 entry:
@@ -261,11 +261,11 @@ declare <4 x i32> @llvm.fshl.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 
 define <2 x i64> @fshl_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) {
 ; RECIP-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshl
 ;
 ; SIZE-LABEL: 'fshl_v2i64_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshl = tail call <2 x i64> @llvm.fshl.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %fshl
 ;
 entry:
diff --git a/llvm/test/Analysis/CostModel/AArch64/fshr.ll b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
index 120b1c4c4c4ef..e4cea1ddf77a0 100644
--- a/llvm/test/Analysis/CostModel/AArch64/fshr.ll
+++ b/llvm/test/Analysis/CostModel/AArch64/fshr.ll
@@ -129,11 +129,11 @@ declare i19 @llvm.fshr.i19(i19, i19, i19)
 
 define <16 x i8> @fshr_v16i8_3rd_arg_vec_const_all_lanes_same(<16 x i8> %a, <16 x i8> %b) {
 ; RECIP-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <16 x i8> %fshr
 ;
 ; SIZE-LABEL: 'fshr_v16i8_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <16 x i8> @llvm.fshr.v16i8(<16 x i8> %a, <16 x i8> %b, <16 x i8> splat (i8 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <16 x i8> %fshr
 ;
 entry:
@@ -173,11 +173,11 @@ declare <16 x i8> @llvm.fshr.v16i8(<16 x i8>, <16 x i8>, <16 x i8>)
 
 define <8 x i16> @fshr_v8i16_3rd_arg_vec_const_all_lanes_same(<8 x i16> %a, <8 x i16> %b) {
 ; RECIP-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <8 x i16> %fshr
 ;
 ; SIZE-LABEL: 'fshr_v8i16_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <8 x i16> @llvm.fshr.v8i16(<8 x i16> %a, <8 x i16> %b, <8 x i16> splat (i16 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <8 x i16> %fshr
 ;
 entry:
@@ -217,11 +217,11 @@ declare <8 x i16> @llvm.fshr.v8i16(<8 x i16>, <8 x i16>, <8 x i16>)
 
 define <4 x i32> @fshr_v4i32_3rd_arg_vec_const_all_lanes_same(<4 x i32> %a, <4 x i32> %b) {
 ; RECIP-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <4 x i32> %fshr
 ;
 ; SIZE-LABEL: 'fshr_v4i32_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <4 x i32> @llvm.fshr.v4i32(<4 x i32> %a, <4 x i32> %b, <4 x i32> splat (i32 3))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <4 x i32> %fshr
 ;
 entry:
@@ -261,11 +261,11 @@ declare <4 x i32> @llvm.fshr.v4i32(<4 x i32>, <4 x i32>, <4 x i32>)
 
 define <2 x i64> @fshr_v2i64_3rd_arg_vec_const_all_lanes_same(<2 x i64> %a, <2 x i64> %b) {
 ; RECIP-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same'
-; RECIP-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; RECIP-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
 ; RECIP-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret <2 x i64> %fshr
 ;
 ; SIZE-LABEL: 'fshr_v2i64_3rd_arg_vec_const_all_lanes_same'
-; SIZE-NEXT:  Cost Model: Found an estimated cost of 3 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
+; SIZE-NEXT:  Cost Model: Found an estimated cost of 2 for instruction: %fshr = tail call <2 x i64> @llvm.fshr.v2i64(<2 x i64> %a, <2 x i64> %b, <2 x i64> splat (i64 1))
 ; SIZE-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: ret <2 x i64> %fshr
 ;
 entry:

``````````

</details>


https://github.com/llvm/llvm-project/pull/130044