[llvm] [X86] Canonicalise insertps(insertps(v, s, c0), s, c1) patterns to blend(v,splat(s)) (PR #178649)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Jan 29 06:02:51 PST 2026
- Previous message: [llvm] [X86] Canonicalise insertps(insertps(v, s, c0), s, c1) patterns to blend(v,splat(s)) (PR #178649)
- Next message: [llvm] [X86] Canonicalise insertps(insertps(v, s, c0), s, c1) patterns to blend(v,splat(s)) (PR #178649)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/178649
From 76b8b952ee0a2d19c26d03a1237beea78002264c Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Thu, 29 Jan 2026 12:42:02 +0000
Subject: [PATCH] [X86] Canonicalise insertps(insertps(v,s,c0),s,c1) patterns
to blend(v,splat(s))
Avoid situations where SimplifyDemandedVectorElts / shuffle combining keeps altering the zero masks of the inner insertps. Usually this is benign, but if the two insertps nodes share operands it can cause infinite loops.
Fixes #178538
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 17 ++++++
llvm/test/CodeGen/X86/build-vector-128.ll | 60 ++++++++++---------
llvm/test/CodeGen/X86/build-vector-512.ll | 12 ++--
.../X86/vector-shuffle-combining-sse41.ll | 36 +++++++++++
4 files changed, 91 insertions(+), 34 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index befd71ef793e2..88ee0c7cc531a 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -43349,6 +43349,23 @@ static SDValue combineTargetShuffle(SDValue N, const SDLoc &DL,
return DAG.getNode(X86ISD::INSERTPS, DL, VT, Op0, DAG.getUNDEF(VT),
DAG.getTargetConstant(InsertPSMask, DL, MVT::i8));
+ // Chained inserts of the same src - prefer splat + blend.
+ // TODO: Splat isn't necessary if they insert into different v2f32 subs.
+ if (Op0.getOpcode() == X86ISD::INSERTPS && Op0.getOperand(1) == Op1) {
+ unsigned InnerPSMask = Op0.getConstantOperandVal(2);
+ unsigned InnerSrcIdx = (InnerPSMask >> 6) & 0x3;
+ unsigned InnerDstIdx = (InnerPSMask >> 4) & 0x3;
+ unsigned InnerZeroMask = InnerPSMask & 0xF;
+ if (SrcIdx == InnerSrcIdx && ZeroMask == 0 && InnerZeroMask == 0) {
+ SmallVector<int, 4> SplatMask(4, (int)SrcIdx);
+ Op1 = DAG.getNode(X86ISD::SHUFP, DL, VT, Op1, Op1,
+ getV4X86ShuffleImm8ForMask(SplatMask, DL, DAG));
+ unsigned BlendMask = (1 << DstIdx) | (1 << InnerDstIdx);
+ return DAG.getNode(X86ISD::BLENDI, DL, VT, Op0, Op1,
+ DAG.getTargetConstant(BlendMask, DL, MVT::i8));
+ }
+ }
+
// Attempt to merge insertps Op1 with an inner target shuffle node.
SmallVector<int, 8> TargetMask1;
SmallVector<SDValue, 2> Ops1;
diff --git a/llvm/test/CodeGen/X86/build-vector-128.ll b/llvm/test/CodeGen/X86/build-vector-128.ll
index 59eb776ac365e..51d3a65be5236 100644
--- a/llvm/test/CodeGen/X86/build-vector-128.ll
+++ b/llvm/test/CodeGen/X86/build-vector-128.ll
@@ -667,11 +667,10 @@ define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) {
;
; SSE41-64-LABEL: test_buildvector_4f32_2_var:
; SSE41-64: # %bb.0:
-; SSE41-64-NEXT: movaps %xmm0, %xmm2
-; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[2,3]
-; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1],xmm1[0],xmm2[3]
-; SSE41-64-NEXT: insertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm0[0]
-; SSE41-64-NEXT: movaps %xmm2, %xmm0
+; SSE41-64-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE41-64-NEXT: movss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; SSE41-64-NEXT: insertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm0[0]
+; SSE41-64-NEXT: movaps %xmm1, %xmm0
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_4f32_2_var:
@@ -681,12 +680,19 @@ define <4 x float> @test_buildvector_4f32_2_var(float %a0, float %a1) {
; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
; AVX-32-NEXT: retl
;
-; AVX-64-LABEL: test_buildvector_4f32_2_var:
-; AVX-64: # %bb.0:
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm1 = xmm2[0,1],xmm1[0],xmm2[3]
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
-; AVX-64-NEXT: retq
+; AVX1-64-LABEL: test_buildvector_4f32_2_var:
+; AVX1-64: # %bb.0:
+; AVX1-64-NEXT: vshufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-64-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX1-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX1-64-NEXT: retq
+;
+; AVX2-64-LABEL: test_buildvector_4f32_2_var:
+; AVX2-64: # %bb.0:
+; AVX2-64-NEXT: vbroadcastss %xmm1, %xmm1
+; AVX2-64-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX2-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
+; AVX2-64-NEXT: retq
%v0 = insertelement <4 x float> poison, float %a0, i32 0
%v1 = insertelement <4 x float> %v0, float %a1, i32 1
%v2 = insertelement <4 x float> %v1, float %a1, i32 2
@@ -722,21 +728,19 @@ define <4 x float> @test_buildvector_4f32_2_load(ptr %p0, ptr %p1) {
; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SSE41-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; SSE41-32-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-32-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-32-NEXT: movaps %xmm2, %xmm0
-; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; SSE41-32-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-32-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-32-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-32-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-32-NEXT: retl
;
; SSE41-64-LABEL: test_buildvector_4f32_2_load:
; SSE41-64: # %bb.0:
; SSE41-64-NEXT: movss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; SSE41-64-NEXT: movss {{.*#+}} xmm2 = mem[0],zero,zero,zero
-; SSE41-64-NEXT: movaps %xmm2, %xmm0
-; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm2[0]
+; SSE41-64-NEXT: movss {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; SSE41-64-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,0,0,0]
+; SSE41-64-NEXT: movss {{.*#+}} xmm0 = xmm1[0],xmm0[1,2,3]
+; SSE41-64-NEXT: insertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
; SSE41-64-NEXT: retq
;
; AVX-32-LABEL: test_buildvector_4f32_2_load:
@@ -744,19 +748,17 @@ define <4 x float> @test_buildvector_4f32_2_load(ptr %p0, ptr %p1) {
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %eax
; AVX-32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; AVX-32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
-; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-32-NEXT: vbroadcastss (%eax), %xmm1
+; AVX-32-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX-32-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-32-NEXT: retl
;
; AVX-64-LABEL: test_buildvector_4f32_2_load:
; AVX-64: # %bb.0:
; AVX-64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = mem[0],zero,zero,zero
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm2 = xmm1[0],xmm0[0],xmm1[2,3]
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm2[0,1],xmm0[0],xmm2[3]
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-64-NEXT: vbroadcastss (%rsi), %xmm1
+; AVX-64-NEXT: vmovss {{.*#+}} xmm1 = xmm0[0],xmm1[1,2,3]
+; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0,1,2],xmm0[0]
; AVX-64-NEXT: retq
%a0 = load float, ptr %p0
%a1 = load float, ptr %p1
diff --git a/llvm/test/CodeGen/X86/build-vector-512.ll b/llvm/test/CodeGen/X86/build-vector-512.ll
index 69d17fe3ab69f..ff54288cfa5e0 100644
--- a/llvm/test/CodeGen/X86/build-vector-512.ll
+++ b/llvm/test/CodeGen/X86/build-vector-512.ll
@@ -591,12 +591,14 @@ define <16 x float> @test_buildvector_16f32_2_var(float %a0, float %a1) {
; AVX-64-NEXT: # kill: def $xmm1 killed $xmm1 def $zmm1
; AVX-64-NEXT: vpmovsxbd {{.*#+}} xmm2 = [0,16,0,0]
; AVX-64-NEXT: vbroadcastss %xmm0, %xmm0
+; AVX-64-NEXT: vmovaps %zmm1, %zmm3
+; AVX-64-NEXT: vpermt2ps %zmm0, %zmm2, %zmm3
+; AVX-64-NEXT: vmovss {{.*#+}} xmm4 = xmm1[0],xmm0[1,2,3]
+; AVX-64-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
; AVX-64-NEXT: vpermi2ps %zmm1, %zmm0, %zmm2
-; AVX-64-NEXT: vinsertps {{.*#+}} xmm3 = xmm0[0,1,2],xmm1[0]
-; AVX-64-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX-64-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,16,0,0,0,17,18,19]
-; AVX-64-NEXT: vpermi2ps %zmm0, %zmm1, %zmm3
-; AVX-64-NEXT: vinsertf64x4 $1, %ymm3, %zmm2, %zmm0
+; AVX-64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
+; AVX-64-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
+; AVX-64-NEXT: vinsertf64x4 $1, %ymm3, %zmm0, %zmm0
; AVX-64-NEXT: retq
%v0 = insertelement <16 x float> poison, float %a0, i32 0
%v1 = insertelement <16 x float> %v0, float %a1, i32 1
diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
index ac5830604461c..26f2932848ea4 100644
--- a/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
+++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-sse41.ll
@@ -214,3 +214,39 @@ define <16 x i8> @PR50049(ptr %p1, ptr %p2) {
%r = mul <16 x i8> %s1, %s2
ret <16 x i8> %r
}
+
+define <4 x float> @PR178538(<4 x float> %a0) {
+; SSE-LABEL: PR178538:
+; SSE: # %bb.0:
+; SSE-NEXT: shufps {{.*#+}} xmm0 = xmm0[1,1],mem[2,3]
+; SSE-NEXT: movss {{.*#+}} xmm1 = [1.0E+0,0.0E+0,0.0E+0,0.0E+0]
+; SSE-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; SSE-NEXT: blendps {{.*#+}} xmm0 = xmm0[0],xmm1[1],xmm0[2],xmm1[3]
+; SSE-NEXT: retq
+;
+; AVX1-LABEL: PR178538:
+; AVX1: # %bb.0:
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX1-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],xmm1[1,2,3]
+; AVX1-NEXT: retq
+;
+; AVX2-LABEL: PR178538:
+; AVX2: # %bb.0:
+; AVX2-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX2-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[1],xmm1[1,2,3]
+; AVX2-NEXT: retq
+;
+; AVX512-LABEL: PR178538:
+; AVX512: # %bb.0:
+; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 def $zmm0
+; AVX512-NEXT: vbroadcastss {{.*#+}} xmm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; AVX512-NEXT: vpmovsxbd {{.*#+}} xmm2 = [17,1,2,0]
+; AVX512-NEXT: vpermt2ps %zmm0, %zmm2, %zmm1
+; AVX512-NEXT: vmovaps %xmm1, %xmm0
+; AVX512-NEXT: vzeroupper
+; AVX512-NEXT: retq
+ %shuffle = shufflevector <4 x float> <float poison, float poison, float 1.000000e+00, float poison>, <4 x float> %a0, <4 x i32> <i32 5, i32 poison, i32 2, i32 poison>
+ %insert1 = insertelement <4 x float> %shuffle, float 1.000000e+00, i64 1
+ %insert3 = insertelement <4 x float> %insert1, float 1.000000e+00, i64 3
+ ret <4 x float> %insert3
+}
- Previous message: [llvm] [X86] Canonicalise insertps(insertps(v, s, c0), s, c1) patterns to blend(v,splat(s)) (PR #178649)
- Next message: [llvm] [X86] Canonicalise insertps(insertps(v, s, c0), s, c1) patterns to blend(v,splat(s)) (PR #178649)
- Messages sorted by:
[ date ]
[ thread ]
[ subject ]
[ author ]
More information about the llvm-commits
mailing list