[llvm] [CostModel][X86] SK_InsertSubvector inserted into the lowest subvector should be treated as SK_Select blend (PR #145892)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jun 26 06:24:49 PDT 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/145892
X86 uses implicit widening and BLEND/MOV shuffles in these cases - otherwise we still treat it as a SK_PermuteTwoSrc
>From bb860d7b216589eaa5788d13756a9664aa968cce Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 26 Jun 2025 14:23:38 +0100
Subject: [PATCH] [CostModel][X86] SK_InsertSubvector inserted into the lowest
subvector should be treated as SK_Select blend
X86 uses implicit widening and BLEND/MOV shuffle in these cases - otherwise we still treat it as a SK_PermuteTwoSrc
---
.../lib/Target/X86/X86TargetTransformInfo.cpp | 6 +++--
.../test/Transforms/PhaseOrdering/X86/hadd.ll | 13 +++-------
.../test/Transforms/PhaseOrdering/X86/hsub.ll | 13 +++-------
.../X86/extracts-non-extendable.ll | 26 ++++++-------------
4 files changed, 20 insertions(+), 38 deletions(-)
diff --git a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
index e73bec2e22a57..37a7b37e4133a 100644
--- a/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
+++ b/llvm/lib/Target/X86/X86TargetTransformInfo.cpp
@@ -1676,8 +1676,10 @@ InstructionCost X86TTIImpl::getShuffleCost(TTI::ShuffleKind Kind,
SubLT.second == MVT::f32 && (Index == 0 || ST->hasSSE41()))
return 1;
- // If the insertion isn't aligned, treat it like a 2-op shuffle.
- Kind = TTI::SK_PermuteTwoSrc;
+ // If the insertion is the lowest subvector then it will be blended
+ // otherwise treat it like a 2-op shuffle.
+ Kind =
+ (Index == 0 && LT.first == 1) ? TTI::SK_Select : TTI::SK_PermuteTwoSrc;
}
// Handle some common (illegal) sub-vector types as they are often very cheap
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 9bfd92ef35a46..b645288f96028 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -290,13 +290,10 @@ define <16 x i16> @add_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @add_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
; SSE4-NEXT: [[TMP9:%.*]] = add <16 x i16> [[TMP7]], [[TMP8]]
; SSE4-NEXT: ret <16 x i16> [[TMP9]]
@@ -404,11 +401,9 @@ define <16 x i16> @add_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @add_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 poison, i32 11, i32 12, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = add <16 x i16> [[TMP4]], [[TMP5]]
; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index 13b4d7da97c9d..c7ebb73e8e35c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -290,13 +290,10 @@ define <16 x i16> @sub_v16i16_0123u56789uBCDEF(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @sub_v16i16_0123u56789uBCDEF(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 26, i32 poison, i32 30, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <16 x i16> [[TMP4]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 27, i32 poison, i32 31, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[TMP5]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 24, i32 26, i32 28, i32 30>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 18, i32 20, i32 22, i32 8, i32 10, i32 poison, i32 14, i32 24, i32 26, i32 28, i32 30>
; SSE4-NEXT: [[TMP8:%.*]] = shufflevector <16 x i16> [[TMP6]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 poison, i32 5, i32 6, i32 7, i32 8, i32 9, i32 poison, i32 11, i32 25, i32 27, i32 29, i32 31>
; SSE4-NEXT: [[TMP9:%.*]] = sub <16 x i16> [[TMP7]], [[TMP8]]
; SSE4-NEXT: ret <16 x i16> [[TMP9]]
@@ -398,11 +395,9 @@ define <16 x i16> @sub_v16i16_FEuCBA98765432u0(<16 x i16> %a, <16 x i16> %b) {
; SSE2-NEXT: ret <16 x i16> [[RESULT]]
;
; SSE4-LABEL: @sub_v16i16_FEuCBA98765432u0(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <16 x i16> [[TMP1]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 24, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[A:%.*]], <16 x i16> [[B:%.*]], <16 x i32> <i32 1, i32 poison, i32 5, i32 7, i32 17, i32 19, i32 21, i32 23, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP10:%.*]] = shufflevector <16 x i16> [[TMP2]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 25, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[TMP3]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 26, i32 28, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 0, i32 poison, i32 4, i32 6, i32 16, i32 18, i32 20, i32 22, i32 8, i32 poison, i32 10, i32 12, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[TMP10]], <16 x i16> [[A]], <16 x i32> <i32 0, i32 poison, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 27, i32 29, i32 poison, i32 poison, i32 poison, i32 poison>
; SSE4-NEXT: [[TMP6:%.*]] = sub <16 x i16> [[TMP4]], [[TMP5]]
; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <16 x i16> [[A]], <16 x i16> [[B]], <16 x i32> <i32 14, i32 24, i32 28, i32 30, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
diff --git a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
index d87c40511fcf7..3a7184f8e0519 100644
--- a/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
+++ b/llvm/test/Transforms/SLPVectorizer/X86/extracts-non-extendable.ll
@@ -7,26 +7,16 @@ define void @test(i64 %v) {
; CHECK-NEXT: [[BB:.*:]]
; CHECK-NEXT: [[TMP0:%.*]] = insertelement <2 x i64> <i64 0, i64 poison>, i64 [[V]], i32 1
; CHECK-NEXT: [[TMP1:%.*]] = or <2 x i64> zeroinitializer, [[TMP0]]
-; CHECK-NEXT: [[TMP2:%.*]] = extractelement <2 x i64> [[TMP1]], i32 1
-; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 0, [[TMP2]]
-; CHECK-NEXT: [[TMP4:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP5:%.*]] = and i1 [[TMP3]], [[TMP4]]
-; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP7:%.*]] = and i1 [[TMP5]], [[TMP6]]
-; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP9:%.*]] = and i1 [[TMP7]], [[TMP8]]
-; CHECK-NEXT: [[TMP10:%.*]] = and i1 [[TMP9]], false
-; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 0, [[TMP2]]
-; CHECK-NEXT: [[TMP12:%.*]] = and i1 [[TMP10]], [[TMP11]]
-; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP14:%.*]] = and i1 [[TMP12]], [[TMP13]]
-; CHECK-NEXT: [[TMP15:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP16:%.*]] = and i1 [[TMP14]], [[TMP15]]
-; CHECK-NEXT: [[TMP17:%.*]] = icmp eq i64 0, 0
-; CHECK-NEXT: [[TMP18:%.*]] = and i1 [[TMP16]], [[TMP17]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x i64> [[TMP1]], <2 x i64> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <8 x i64> [[TMP2]], <8 x i64> <i64 poison, i64 0, i64 0, i64 0, i64 poison, i64 0, i64 0, i64 0>, <8 x i32> <i32 0, i32 9, i32 10, i32 11, i32 4, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP4:%.*]] = icmp eq <8 x i64> zeroinitializer, [[TMP3]]
; CHECK-NEXT: [[TMP19:%.*]] = icmp ult i64 0, 0
+; CHECK-NEXT: [[TMP6:%.*]] = freeze <8 x i1> [[TMP4]]
+; CHECK-NEXT: [[TMP18:%.*]] = call i1 @llvm.vector.reduce.and.v8i1(<8 x i1> [[TMP6]])
; CHECK-NEXT: [[TMP20:%.*]] = select i1 [[TMP19]], i1 [[TMP18]], i1 false
-; CHECK-NEXT: br i1 [[TMP20]], label %[[BB_I107_PREHEADER:.*]], label %[[BB_I27_I_PREHEADER:.*]]
+; CHECK-NEXT: [[TMP8:%.*]] = freeze i1 [[TMP20]]
+; CHECK-NEXT: [[OP_RDX1:%.*]] = select i1 [[TMP8]], i1 false, i1 false
+; CHECK-NEXT: br i1 [[OP_RDX1]], label %[[BB_I107_PREHEADER:.*]], label %[[BB_I27_I_PREHEADER:.*]]
; CHECK: [[BB_I107_PREHEADER]]:
; CHECK-NEXT: [[TMP21:%.*]] = extractelement <2 x i64> [[TMP1]], i32 0
; CHECK-NEXT: [[DOTSROA_1278_10_EXTRACT_SHIFT83_I1622_1:%.*]] = xor i64 0, [[TMP21]]
More information about the llvm-commits
mailing list