[llvm] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx) --> binop(insert(x,a,idx),insert(y,b,idx)) (PR #124909)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Wed Jan 29 03:42:20 PST 2025
https://github.com/RKSimon created https://github.com/llvm/llvm-project/pull/124909
Add a foldInsExtBinop fold to clean up missed vectorization cases. These can occur on targets whose cheap insert/extract instructions prevent foldExtractExtract (binop(extract(x),extract(y)) -> extract(binop(x,shuffle(y)))) from helping with the merge.
WIP - still need to add VectorCombine-specific tests
>From 1e285708cb56f24b20c868328f89e12001916227 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Wed, 29 Jan 2025 11:41:16 +0000
Subject: [PATCH] [WIP][VectorCombine] Fold insert(binop(x,y),binop(a,b),idx)
--> binop(insert(x,a,idx),insert(y,b,idx))
Add a foldInsExtBinop fold to clean up missed vectorization cases. These can occur on targets whose cheap insert/extract instructions prevent foldExtractExtract (binop(extract(x),extract(y)) -> extract(binop(x,shuffle(y)))) from helping with the merge.
WIP - still need to add VectorCombine-specific tests
---
.../Transforms/Vectorize/VectorCombine.cpp | 60 ++++++++++++
.../test/Transforms/PhaseOrdering/X86/hadd.ll | 94 +++++--------------
.../test/Transforms/PhaseOrdering/X86/hsub.ll | 93 +++++-------------
.../Transforms/PhaseOrdering/X86/pr50392.ll | 10 +-
4 files changed, 109 insertions(+), 148 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 59920b5a4dd20a..09e4b8cc17fbbc 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -108,6 +108,7 @@ class VectorCombine {
Instruction &I);
bool foldExtractExtract(Instruction &I);
bool foldInsExtFNeg(Instruction &I);
+ bool foldInsExtBinop(Instruction &I);
bool foldInsExtVectorToShuffle(Instruction &I);
bool foldBitcastShuffle(Instruction &I);
bool scalarizeBinopOrCmp(Instruction &I);
@@ -738,6 +739,64 @@ bool VectorCombine::foldInsExtFNeg(Instruction &I) {
return true;
}
+/// Try to fold insert(binop(x,y),binop(a,b),idx)
+/// --> binop(insert(x,a,idx),insert(y,b,idx))
+bool VectorCombine::foldInsExtBinop(Instruction &I) {
+ BinaryOperator *VecBinOp, *SclBinOp;
+ uint64_t Index;
+ if (!match(&I,
+ m_InsertElt(m_OneUse(m_BinOp(VecBinOp)),
+ m_OneUse(m_BinOp(SclBinOp)), m_ConstantInt(Index))))
+ return false;
+
+ // TODO: Add support for addlike etc.
+ Instruction::BinaryOps BinOpcode = VecBinOp->getOpcode();
+ if (BinOpcode != SclBinOp->getOpcode())
+ return false;
+
+ auto *ResultTy = dyn_cast<FixedVectorType>(I.getType());
+ if (!ResultTy)
+ return false;
+
+ // TODO: Attempt to detect m_ExtractElt for scalar operands and convert to
+ // shuffle?
+
+ InstructionCost OldCost = TTI.getInstructionCost(&I, CostKind) +
+ TTI.getInstructionCost(VecBinOp, CostKind) +
+ TTI.getInstructionCost(SclBinOp, CostKind);
+ InstructionCost NewCost =
+ TTI.getArithmeticInstrCost(BinOpcode, ResultTy, CostKind) +
+ TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
+ Index, VecBinOp->getOperand(0),
+ SclBinOp->getOperand(0)) +
+ TTI.getVectorInstrCost(Instruction::InsertElement, ResultTy, CostKind,
+ Index, VecBinOp->getOperand(1),
+ SclBinOp->getOperand(1));
+
+ LLVM_DEBUG(dbgs() << "Found an insertion of two binops: " << I
+ << "\n OldCost: " << OldCost << " vs NewCost: " << NewCost
+ << "\n");
+ if (NewCost > OldCost)
+ return false;
+
+ Value *NewIns0 = Builder.CreateInsertElement(VecBinOp->getOperand(0),
+ SclBinOp->getOperand(0), Index);
+ Value *NewIns1 = Builder.CreateInsertElement(VecBinOp->getOperand(1),
+ SclBinOp->getOperand(1), Index);
+ Value *NewBO = Builder.CreateBinOp(BinOpcode, NewIns0, NewIns1);
+
+ // Intersect flags from the old binops.
+ if (auto *NewInst = dyn_cast<Instruction>(NewBO)) {
+ NewInst->copyIRFlags(VecBinOp);
+ NewInst->andIRFlags(SclBinOp);
+ }
+
+ Worklist.pushValue(NewIns0);
+ Worklist.pushValue(NewIns1);
+ replaceValue(I, *NewBO);
+ return true;
+}
+
/// If this is a bitcast of a shuffle, try to bitcast the source vector to the
/// destination type followed by shuffle. This can enable further transforms by
/// moving bitcasts or shuffles together.
@@ -3206,6 +3265,7 @@ bool VectorCombine::run() {
switch (Opcode) {
case Instruction::InsertElement:
MadeChange |= foldInsExtFNeg(I);
+ MadeChange |= foldInsExtBinop(I);
MadeChange |= foldInsExtVectorToShuffle(I);
break;
case Instruction::ShuffleVector:
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
index 0c9f279c01bae0..10a88a66b369a1 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -378,31 +378,11 @@ define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: @add_v8i32_01234u67(
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
-; SSE2-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT: ret <8 x i32> [[TMP4]]
-;
-; SSE4-LABEL: @add_v8i32_01234u67(
-; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
-; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE4-NEXT: [[A45:%.*]] = add i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP2]], [[TMP5]]
-; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP7]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x i32> [[RESULT]]
-;
-; AVX-LABEL: @add_v8i32_01234u67(
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
-; AVX-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
-; AVX-NEXT: ret <8 x i32> [[TMP7]]
+; CHECK-LABEL: @add_v8i32_01234u67(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 13, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 12, i32 15>
+; CHECK-NEXT: [[RESULT:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -700,33 +680,21 @@ define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: @add_v8f32_012u4567(
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE2-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 5, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 4, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP5]], [[TMP8]]
-; SSE2-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @add_v8f32_012u4567(
-; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE4-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP2]], [[TMP5]]
-; SSE4-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 12, i32 15>
; SSE4-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP7]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x float> [[RESULT]]
+; SSE4-NEXT: ret <8 x float> [[TMP6]]
;
; AVX-LABEL: @add_v8f32_012u4567(
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 13, i32 14>
@@ -873,14 +841,10 @@ define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_u123(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_u123(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
@@ -921,14 +885,10 @@ define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_0u23(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 0, i32 poison, i32 3, i32 7>
; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP4]]
;
; AVX-LABEL: @add_v4f64_0u23(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 1, i32 poison, i32 2, i32 6>
@@ -969,14 +929,10 @@ define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_01u3(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_01u3(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
@@ -1017,14 +973,10 @@ define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @add_v4f64_012u(
-; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
-; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE4-NEXT: [[A23:%.*]] = fadd double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @add_v4f64_012u(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
index ae05f6470e5636..08b0d6b950c2d2 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -378,31 +378,11 @@ define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
}
define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
-; SSE2-LABEL: @sub_v8i32_01234u67(
-; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
-; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
-; SSE2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
-; SSE2-NEXT: ret <8 x i32> [[TMP4]]
-;
-; SSE4-LABEL: @sub_v8i32_01234u67(
-; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
-; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
-; SSE4-NEXT: [[A45:%.*]] = sub i32 [[A4]], [[A5]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP7]], [[TMP5]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x i32> [[RESULT]]
-;
-; AVX-LABEL: @sub_v8i32_01234u67(
-; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
-; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
-; AVX-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
-; AVX-NEXT: ret <8 x i32> [[TMP7]]
+; CHECK-LABEL: @sub_v8i32_01234u67(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; CHECK-NEXT: [[RESULT:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i32> [[RESULT]]
;
%a0 = extractelement <8 x i32> %a, i32 0
%a1 = extractelement <8 x i32> %a, i32 1
@@ -700,33 +680,21 @@ define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
; SSE2-LABEL: @sub_v8f32_012u4567(
-; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE2-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
-; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP8:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 poison, i32 poison>
; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP5]], [[TMP8]]
-; SSE2-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[TMP6]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
; SSE2-NEXT: ret <8 x float> [[RESULT]]
;
; SSE4-LABEL: @sub_v8f32_012u4567(
-; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
-; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
-; SSE4-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP2]], [[TMP5]]
-; SSE4-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; SSE4-NEXT: [[TMP7:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP7]]
-; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
-; SSE4-NEXT: ret <8 x float> [[RESULT]]
+; SSE4-NEXT: ret <8 x float> [[TMP6]]
;
; AVX-LABEL: @sub_v8f32_012u4567(
; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
@@ -873,14 +841,10 @@ define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_u123(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_u123(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
@@ -921,13 +885,10 @@ define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_0u23(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[TMP1:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 3, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
; SSE4-NEXT: [[TMP2:%.*]] = fsub <4 x double> [[TMP1]], [[TMP3]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP2]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP2]]
;
; AVX-LABEL: @sub_v4f64_0u23(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
@@ -968,14 +929,10 @@ define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_01u3(
-; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
-; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_01u3(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
@@ -1016,14 +973,10 @@ define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[RESULT]]
;
; SSE4-LABEL: @sub_v4f64_012u(
-; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
-; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
-; SSE4-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]]
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
-; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP2]], [[TMP4]]
-; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
-; SSE4-NEXT: ret <4 x double> [[RESULT]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @sub_v4f64_012u(
; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
index d92df9741644b7..842cf42505673c 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/pr50392.ll
@@ -21,14 +21,10 @@ define <4 x double> @PR50392(<4 x double> %a, <4 x double> %b) {
; SSE2-NEXT: ret <4 x double> [[SHUFFLE]]
;
; SSE4-LABEL: @PR50392(
-; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 poison>
-; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
-; SSE4-NEXT: [[VECEXT10:%.*]] = extractelement <4 x double> [[B]], i64 2
-; SSE4-NEXT: [[VECEXT11:%.*]] = extractelement <4 x double> [[B]], i64 3
-; SSE4-NEXT: [[ADD12:%.*]] = fadd double [[VECEXT10]], [[VECEXT11]]
-; SSE4-NEXT: [[SHUFFLE:%.*]] = insertelement <4 x double> [[TMP3]], double [[ADD12]], i64 3
-; SSE4-NEXT: ret <4 x double> [[SHUFFLE]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
;
; AVX-LABEL: @PR50392(
; AVX-NEXT: [[B:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B1:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
More information about the llvm-commits
mailing list