[llvm] fff12fb - [VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost. (#138419)
via llvm-commits
llvm-commits at lists.llvm.org
Fri May 9 00:55:39 PDT 2025
Author: David Green
Date: 2025-05-09T08:55:36+01:00
New Revision: fff12fbdb9a20d353cc578076232dcb3f5c60586
URL: https://github.com/llvm/llvm-project/commit/fff12fbdb9a20d353cc578076232dcb3f5c60586
DIFF: https://github.com/llvm/llvm-project/commit/fff12fbdb9a20d353cc578076232dcb3f5c60586.diff
LOG: [VectorCombine] Fix the type used in foldShuffleOfIntrinsics Cost. (#138419)
The shuffle needn't be twice the original number of vector elements, so
the intermediate type used between the shuffle and the intrinsic should
use the ShuffleDstTy number of elements.
I found this when looking at shuffle costs and do not have test where it
alters the output, but have added some cases where the shuffle output is
not twice the size of the input.
Added:
Modified:
llvm/lib/Transforms/Vectorize/VectorCombine.cpp
llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
Removed:
################################################################################
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index e58789b5d5641..c7d221e8d1e5c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2376,7 +2376,7 @@ bool VectorCombine::foldShuffleOfIntrinsics(Instruction &I) {
} else {
auto *VecTy = cast<FixedVectorType>(II0->getArgOperand(I)->getType());
NewArgsTy.push_back(FixedVectorType::get(VecTy->getElementType(),
- VecTy->getNumElements() * 2));
+ ShuffleDstTy->getNumElements()));
NewCost += TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc,
VecTy, OldMask, CostKind);
}
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
index 2dc76cbfdda41..e64e7807b7d4a 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-intrinsics.ll
@@ -69,6 +69,20 @@ entry:
ret <8 x i1> %4
}
+define <2 x i1> @test4b(<4 x float> %0, <4 x float> %1) {
+; CHECK-LABEL: @test4b(
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[TMP0:%.*]], <4 x float> [[TMP1:%.*]], <2 x i32> <i32 0, i32 4>
+; CHECK-NEXT: [[TMP3:%.*]] = call <2 x i1> @llvm.is.fpclass.v2f32(<2 x float> [[TMP2]], i32 0)
+; CHECK-NEXT: ret <2 x i1> [[TMP3]]
+;
+entry:
+ %2 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %0, i32 0)
+ %3 = call <4 x i1> @llvm.is.fpclass.v4f32(<4 x float> %1, i32 0)
+ %4 = shufflevector <4 x i1> %2, <4 x i1> %3, <2 x i32> <i32 0, i32 4>
+ ret <2 x i1> %4
+}
+
define <8 x float> @test5(<4 x float> %0, i32 %1, <4 x float> %2, <4 x i32> %3) {
; CHECK-LABEL: @test5(
; CHECK-NEXT: entry:
@@ -84,6 +98,26 @@ entry:
ret <8 x float> %6
}
+define <2 x float> @test6(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1, <4 x float> %a2, <4 x float> %b2, <4 x float> %c2) {
+; SSE-LABEL: @test6(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A1:%.*]], <4 x float> [[A2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B1:%.*]], <4 x float> [[B2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[C1:%.*]], <4 x float> [[C2:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[S:%.*]] = call <2 x float> @llvm.fma.v2f32(<2 x float> [[TMP1]], <2 x float> [[TMP2]], <2 x float> [[TMP3]])
+; SSE-NEXT: ret <2 x float> [[S]]
+;
+; AVX-LABEL: @test6(
+; AVX-NEXT: [[F1:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A1:%.*]], <4 x float> [[B1:%.*]], <4 x float> [[C1:%.*]])
+; AVX-NEXT: [[F2:%.*]] = call <4 x float> @llvm.fma.v4f32(<4 x float> [[A2:%.*]], <4 x float> [[B2:%.*]], <4 x float> [[C2:%.*]])
+; AVX-NEXT: [[S:%.*]] = shufflevector <4 x float> [[F1]], <4 x float> [[F2]], <2 x i32> <i32 0, i32 4>
+; AVX-NEXT: ret <2 x float> [[S]]
+;
+ %f1 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a1, <4 x float> %b1, <4 x float> %c1)
+ %f2 = call <4 x float> @llvm.fma.v4f32(<4 x float> %a2, <4 x float> %b2, <4 x float> %c2)
+ %s = shufflevector <4 x float> %f1, <4 x float> %f2, <2 x i32> <i32 0, i32 4>
+ ret <2 x float> %s
+}
+
declare <4 x i32> @llvm.abs.v4i32(<4 x i32>, i1)
declare <4 x i32> @llvm.smax.v4i32(<4 x i32>, <4 x i32>)
declare <4 x i1> @llvm.is.fpclass.v4f32(<4 x float>, i32)
More information about the llvm-commits
mailing list