[llvm] [VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost. (PR #138419)
via llvm-commits
llvm-commits at lists.llvm.org
Sat May 3 10:46:30 PDT 2025
llvmbot wrote:
<!--LLVM PR SUMMARY COMMENT-->
@llvm/pr-subscribers-llvm-transforms
Author: David Green (davemgreen)
<details>
<summary>Changes</summary>
The result of the shuffle need not have twice as many elements as its source vectors, so the intermediate type used between the shuffle and the intrinsic should use the number of elements of ShuffleDstTy.
I found this when looking at shuffle costs and do not have a test where it alters the output, but I have added some cases where the shuffle output is not twice the size of the input.
---
Full diff: https://github.com/llvm/llvm-project/pull/138419.diff
2 Files Affected:
- (modified) llvm/lib/Transforms/Vectorize/VectorCombine.cpp (+1-1)
- (modified) llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll (+35)
``````````diff
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 04c084ffdda97..352a816fe0af6 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2377,7 +2377,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
} else {
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
- VecTy->getNumElements() * 2));
+ ShuffleDstTy->getNumElements()));
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
VecTy, OldMask, CostKind);
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index 2dc76cbfdda41..1e17e4db24ad4 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -69,6 +69,20 @@ entry:
ret <8 x i1> %4
}
+define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test4b(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
+; CHECK-NEXT: ret <2 x i1> [[TMP3]]
+;
+entry:
+ %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+ %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+ %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
+ ret <2 x i1> %4
+}
+
define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
@@ -84,6 +98,27 @@ entry:
ret <8 x float> %6
}
+define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
+; SSE-LABEL: @test6(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
+; SSE-NEXT: ret <2 x float> [[S]]
+;
+; AVX-LABEL: @test6(
+; AVX-NEXT: [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
+; AVX-NEXT: [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
+; AVX-NEXT: [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
+; AVX-NEXT: ret <2 x float> [[S]]
+;
+ %f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
+ %f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
+ %s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
+ ret <2 x float> %s
+}
+
+
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
``````````
</details>
https://github.com/llvm/llvm-project/pull/138419
More information about the llvm-commits mailing list