[llvm] [VectorCombine] Combine extract/insert from vector (PR #115213)

via llvm-commits llvm-commits at lists.llvm.org
Fri Nov 8 10:23:27 PST 2024


https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/115213

>From a89787ff849cfbc6d9b4d15ca903e8f103202882 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Tue, 29 Oct 2024 04:17:30 +0900
Subject: [PATCH 1/8] [VectorCombine] Combine extract/insert from vector

insert (DstVec, (extract SrcVec, ExtIdx), InsIdx)
--> shuffle (DstVec, SrcVec, Mask)

This commit combines extract/insert on a vector into Shuffle with
vector.
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 44 +++++++++++++
 .../X86/extract-binop-inseltpoison.ll         | 65 ++++++++++++-------
 .../VectorCombine/X86/extract-binop.ll        | 65 ++++++++++++-------
 .../VectorCombine/X86/extract-cmp.ll          |  3 +-
 .../VectorCombine/X86/load-inseltpoison.ll    | 39 +++++++----
 .../test/Transforms/VectorCombine/X86/load.ll | 39 +++++++----
 6 files changed, 183 insertions(+), 72 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 58145c7e3c5913..4cb293a4c59f49 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -106,6 +106,7 @@ class VectorCombine {
                        Instruction &I);
   bool foldExtractExtract(Instruction &I);
   bool foldInsExtFNeg(Instruction &I);
+  bool foldInsExtVectorToShuffle(Instruction &I);
   bool foldBitcastShuffle(Instruction &I);
   bool scalarizeBinopOrCmp(Instruction &I);
   bool scalarizeVPIntrinsic(Instruction &I);
@@ -2678,6 +2679,48 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
   return true;
 }
 
+/// insert (DstVec, (extract SrcVec, ExtIdx), InsIdx) -->
+/// shuffle (DstVec, SrcVec, Mask)
+bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
+  Value *DstVec, *SrcVec;
+  uint64_t ExtIdx, InsIdx;
+  if (!match(&I, m_InsertElt(m_Value(DstVec),
+                             m_OneUse(m_ExtractElt(m_Value(SrcVec),
+                                                   m_ConstantInt(ExtIdx))),
+                             m_ConstantInt(InsIdx))))
+    return false;
+
+  auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!VecTy || SrcVec->getType() != VecTy)
+    return false;
+
+  unsigned NumElts = VecTy->getNumElements();
+  if (ExtIdx >= NumElts)
+    return false;
+
+  SmallVector<int> Mask(NumElts);
+  std::iota(Mask.begin(), Mask.end(), 0);
+  Mask[InsIdx] = ExtIdx + NumElts;
+  // Cost
+  ExtractElementInst *Ext;
+  if ((Ext = dyn_cast<ExtractElementInst>(I.getOperand(0))) == nullptr)
+    Ext = dyn_cast<ExtractElementInst>(I.getOperand(1));
+
+  TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
+  InstructionCost OldCost =
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+  InstructionCost NewCost =
+      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+
+  if (OldCost < NewCost)
+    return false;
+
+  Value *Shuf = Builder.CreateShuffleVector(DstVec, SrcVec, Mask);
+  replaceValue(I, *Shuf);
+
+  return true;
+}
+
 /// This is the entry point for all transforms. Pass manager differences are
 /// handled in the callers of this function.
 bool VectorCombine::run() {
@@ -2734,6 +2777,7 @@ bool VectorCombine::run() {
       switch (Opcode) {
       case Instruction::InsertElement:
         MadeChange |= foldInsExtFNeg(I);
+        MadeChange |= foldInsExtVectorToShuffle(I);
         break;
       case Instruction::ShuffleVector:
         MadeChange |= foldShuffleOfBinops(I);
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index 3d69f15fc5f249..ed9029e14717ea 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -417,12 +417,18 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -452,22 +458,37 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; CHECK-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; CHECK-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @PR34724(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @PR34724(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 52f7cd859a1ab1..1d3177e5d83f38 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -417,12 +417,18 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -452,22 +458,37 @@ define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
 }
 
 define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @PR34724(
-; CHECK-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
-; CHECK-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; CHECK-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
-; CHECK-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; CHECK-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; CHECK-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; CHECK-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1
-; CHECK-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @PR34724(
+; SSE-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
+; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1
+; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @PR34724(
+; AVX-NEXT:    [[A0:%.*]] = extractelement <4 x float> [[A:%.*]], i32 0
+; AVX-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
+; AVX-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
+; AVX-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
+; AVX-NEXT:    [[V1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; AVX-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
+; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a0 = extractelement <4 x float> %a, i32 0
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
index 64c86a741b1779..3dae93665b1ed7 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-cmp.ll
@@ -163,8 +163,7 @@ define <4 x i1> @ins_fcmp_ext_ext(<4 x float> %a, <4 x i1> %b) {
 ; AVX-LABEL: @ins_fcmp_ext_ext(
 ; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 poison>
 ; AVX-NEXT:    [[TMP1:%.*]] = fcmp ugt <4 x float> [[A]], [[SHIFT]]
-; AVX-NEXT:    [[A21:%.*]] = extractelement <4 x i1> [[TMP1]], i32 2
-; AVX-NEXT:    [[R:%.*]] = insertelement <4 x i1> [[B:%.*]], i1 [[A21]], i32 2
+; AVX-NEXT:    [[R:%.*]] = shufflevector <4 x i1> [[B:%.*]], <4 x i1> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 3>
 ; AVX-NEXT:    ret <4 x i1> [[R]]
 ;
   %a1 = extractelement <4 x float> %a, i32 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index c4aba63568e2ff..e99e21641531ab 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -537,19 +537,32 @@ define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16)
 
 declare ptr @getscaleptr()
 define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) nofree nosync {
-; CHECK-LABEL: @PR47558_multiple_use_load(
-; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
-; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
-; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; CHECK-NEXT:    ret void
+; SSE2-LABEL: @PR47558_multiple_use_load(
+; SSE2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; SSE2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; SSE2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; SSE2-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
+; SSE2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; SSE2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; SSE2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; SSE2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
+; SSE2-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
+; SSE2-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
+; SSE2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; SSE2-NEXT:    ret void
+;
+; AVX2-LABEL: @PR47558_multiple_use_load(
+; AVX2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; AVX2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; AVX2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; AVX2-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
+; AVX2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; AVX2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; AVX2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; AVX2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; AVX2-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
   %op = load <2 x float>, ptr %opptr, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index d5c19b35838d70..cf3ec41c2935e3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -520,19 +520,32 @@ define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16)
 
 declare ptr @getscaleptr()
 define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) {
-; CHECK-LABEL: @PR47558_multiple_use_load(
-; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
-; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
-; CHECK-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
-; CHECK-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
-; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; CHECK-NEXT:    ret void
+; SSE2-LABEL: @PR47558_multiple_use_load(
+; SSE2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; SSE2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; SSE2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; SSE2-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
+; SSE2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; SSE2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; SSE2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; SSE2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
+; SSE2-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
+; SSE2-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
+; SSE2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; SSE2-NEXT:    ret void
+;
+; AVX2-LABEL: @PR47558_multiple_use_load(
+; AVX2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; AVX2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; AVX2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; AVX2-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
+; AVX2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; AVX2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; AVX2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; AVX2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
+; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
+; AVX2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; AVX2-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
   %op = load <2 x float>, ptr %opptr, align 4

>From ed03d11823ed4e4c58d7f97fdaa6040dd4ff18f1 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Fri, 8 Nov 2024 00:30:38 +0900
Subject: [PATCH 2/8] use isa to get ExtractElts from current Instruction

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4cb293a4c59f49..b1d6d3cdd4d3a5 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2703,8 +2703,9 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   Mask[InsIdx] = ExtIdx + NumElts;
   // Cost
   ExtractElementInst *Ext;
-  if ((Ext = dyn_cast<ExtractElementInst>(I.getOperand(0))) == nullptr)
-    Ext = dyn_cast<ExtractElementInst>(I.getOperand(1));
+  Ext = isa<ExtractElementInst>(I.getOperand(0))
+            ? cast<ExtractElementInst>(I.getOperand(0))
+            : cast<ExtractElementInst>(I.getOperand(1));
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost =

>From 93364657c6660e16be0eda70fc5a36f282ad76cc Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Fri, 8 Nov 2024 00:48:04 +0900
Subject: [PATCH 3/8] use SmallVector with initial value

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b1d6d3cdd4d3a5..3a2b33adc7d5a8 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2698,7 +2698,7 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   if (ExtIdx >= NumElts)
     return false;
 
-  SmallVector<int> Mask(NumElts);
+  SmallVector<int> Mask(NumElts, 0);
   std::iota(Mask.begin(), Mask.end(), 0);
   Mask[InsIdx] = ExtIdx + NumElts;
   // Cost

>From 60c4bb1d0b94cc258b99e3cc6b8d4dfe8c80ddc8 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 9 Nov 2024 03:16:13 +0900
Subject: [PATCH 4/8] add combining condition for InsIdx >= NumEls

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 3a2b33adc7d5a8..72bbcef4d5394c 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2695,7 +2695,7 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
     return false;
 
   unsigned NumElts = VecTy->getNumElements();
-  if (ExtIdx >= NumElts)
+  if (ExtIdx >= NumElts || InsIdx >= NumElts)
     return false;
 
   SmallVector<int> Mask(NumElts, 0);

>From f9d0f2f63f81c7f72a28ccea39d17cbd9f652767 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 9 Nov 2024 03:17:30 +0900
Subject: [PATCH 5/8] Change the way to get ExtElts

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 72bbcef4d5394c..a2645588accfef 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2702,10 +2702,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   std::iota(Mask.begin(), Mask.end(), 0);
   Mask[InsIdx] = ExtIdx + NumElts;
   // Cost
-  ExtractElementInst *Ext;
-  Ext = isa<ExtractElementInst>(I.getOperand(0))
-            ? cast<ExtractElementInst>(I.getOperand(0))
-            : cast<ExtractElementInst>(I.getOperand(1));
+  auto *Ins = cast<InsertElementInst>(&I);
+  auto *Ext = cast<ExtractElementInst>(I.getOperand(1));
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost =

>From 2eea29da756d77d6b5c994a830a6b2dffbe8cb9a Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 9 Nov 2024 03:18:46 +0900
Subject: [PATCH 6/8] Correcting incorrect cost calculation

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index a2645588accfef..7d3328e1ffa037 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2707,9 +2707,11 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
 
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   InstructionCost OldCost =
-      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
+      TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx) +
+      TTI.getVectorInstrCost(*Ins, VecTy, CostKind, InsIdx);
+
   InstructionCost NewCost =
-      TTI.getShuffleCost(TargetTransformInfo::SK_Select, VecTy, Mask);
+      TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask);
 
   if (OldCost < NewCost)
     return false;

>From 82ff3018aeb7686574669b20e679f60f47c416d7 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 9 Nov 2024 03:20:12 +0900
Subject: [PATCH 7/8] Handling the case when ExtElt is not the OneUse

---
 llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 7d3328e1ffa037..113311ffef785d 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -2684,10 +2684,10 @@ bool VectorCombine::shrinkType(llvm::Instruction &I) {
 bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
   Value *DstVec, *SrcVec;
   uint64_t ExtIdx, InsIdx;
-  if (!match(&I, m_InsertElt(m_Value(DstVec),
-                             m_OneUse(m_ExtractElt(m_Value(SrcVec),
-                                                   m_ConstantInt(ExtIdx))),
-                             m_ConstantInt(InsIdx))))
+  if (!match(&I,
+             m_InsertElt(m_Value(DstVec),
+                         m_ExtractElt(m_Value(SrcVec), m_ConstantInt(ExtIdx)),
+                         m_ConstantInt(InsIdx))))
     return false;
 
   auto *VecTy = dyn_cast<FixedVectorType>(I.getType());
@@ -2712,6 +2712,8 @@ bool VectorCombine::foldInsExtVectorToShuffle(Instruction &I) {
 
   InstructionCost NewCost =
       TTI.getShuffleCost(TargetTransformInfo::SK_PermuteTwoSrc, VecTy, Mask);
+  if (!Ext->hasOneUse())
+    NewCost += TTI.getVectorInstrCost(*Ext, VecTy, CostKind, ExtIdx);
 
   if (OldCost < NewCost)
     return false;

>From 78bf832be4480eaca6cb161a24f8093fa8537ce9 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 9 Nov 2024 03:22:25 +0900
Subject: [PATCH 8/8] update testcases

---
 .../X86/extract-binop-inseltpoison.ll         | 48 +++++++++----------
 .../VectorCombine/X86/extract-binop.ll        | 48 +++++++++----------
 .../VectorCombine/X86/load-inseltpoison.ll    | 38 +++++----------
 .../test/Transforms/VectorCombine/X86/load.ll | 38 +++++----------
 4 files changed, 70 insertions(+), 102 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
index ed9029e14717ea..bbf0db677461e4 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop-inseltpoison.ll
@@ -417,18 +417,11 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @ins_bo_ext_ext(
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @ins_bo_ext_ext(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @ins_bo_ext_ext(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -441,13 +434,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
 ;       but it is likely that extracting from index 3 is the better option.
 
 define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext_uses(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    call void @use_f32(float [[A23]])
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext_uses(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    call void @use_f32(float [[A23]])
+; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext_uses(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX-NEXT:    call void @use_f32(float [[A23]])
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -463,16 +464,13 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
 ; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
 ; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> poison, float [[A23]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> poison, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[V3]]
 ;
 ; AVX-LABEL: @PR34724(
diff --git a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
index 1d3177e5d83f38..284d2859304eb8 100644
--- a/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/extract-binop.ll
@@ -417,18 +417,11 @@ define float @ext14_ext15_fmul_v16f32(<16 x float> %x) {
 }
 
 define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
-; SSE-LABEL: @ins_bo_ext_ext(
-; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i64 3
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; SSE-NEXT:    ret <4 x float> [[V3]]
-;
-; AVX-LABEL: @ins_bo_ext_ext(
-; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
-; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
-; AVX-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
-; AVX-NEXT:    ret <4 x float> [[V3]]
+; CHECK-LABEL: @ins_bo_ext_ext(
+; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[A]]
+; CHECK-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
+; CHECK-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -441,13 +434,21 @@ define <4 x float> @ins_bo_ext_ext(<4 x float> %a, <4 x float> %b) {
 ;       but it is likely that extracting from index 3 is the better option.
 
 define <4 x float> @ins_bo_ext_ext_uses(<4 x float> %a, <4 x float> %b) {
-; CHECK-LABEL: @ins_bo_ext_ext_uses(
-; CHECK-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
-; CHECK-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; CHECK-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
-; CHECK-NEXT:    call void @use_f32(float [[A23]])
-; CHECK-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
-; CHECK-NEXT:    ret <4 x float> [[V3]]
+; SSE-LABEL: @ins_bo_ext_ext_uses(
+; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; SSE-NEXT:    call void @use_f32(float [[A23]])
+; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+; SSE-NEXT:    ret <4 x float> [[V3]]
+;
+; AVX-LABEL: @ins_bo_ext_ext_uses(
+; AVX-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
+; AVX-NEXT:    call void @use_f32(float [[A23]])
+; AVX-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[B:%.*]], float [[A23]], i32 3
+; AVX-NEXT:    ret <4 x float> [[V3]]
 ;
   %a2 = extractelement <4 x float> %a, i32 2
   %a3 = extractelement <4 x float> %a, i32 3
@@ -463,16 +464,13 @@ define <4 x float> @PR34724(<4 x float> %a, <4 x float> %b) {
 ; SSE-NEXT:    [[A1:%.*]] = extractelement <4 x float> [[A]], i32 1
 ; SSE-NEXT:    [[SHIFT:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
 ; SSE-NEXT:    [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
-; SSE-NEXT:    [[A23:%.*]] = extractelement <4 x float> [[TMP1]], i32 2
 ; SSE-NEXT:    [[SHIFT1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
 ; SSE-NEXT:    [[TMP2:%.*]] = fadd <4 x float> [[B]], [[SHIFT1]]
-; SSE-NEXT:    [[B01:%.*]] = extractelement <4 x float> [[TMP2]], i32 0
 ; SSE-NEXT:    [[SHIFT2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
 ; SSE-NEXT:    [[TMP3:%.*]] = fadd <4 x float> [[SHIFT2]], [[B]]
-; SSE-NEXT:    [[B23:%.*]] = extractelement <4 x float> [[TMP3]], i64 3
-; SSE-NEXT:    [[V1:%.*]] = insertelement <4 x float> undef, float [[A23]], i32 1
-; SSE-NEXT:    [[V2:%.*]] = insertelement <4 x float> [[V1]], float [[B01]], i32 2
-; SSE-NEXT:    [[V3:%.*]] = insertelement <4 x float> [[V2]], float [[B23]], i32 3
+; SSE-NEXT:    [[V1:%.*]] = shufflevector <4 x float> undef, <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 6, i32 2, i32 3>
+; SSE-NEXT:    [[V2:%.*]] = shufflevector <4 x float> [[V1]], <4 x float> [[TMP2]], <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+; SSE-NEXT:    [[V3:%.*]] = shufflevector <4 x float> [[V2]], <4 x float> [[TMP3]], <4 x i32> <i32 0, i32 1, i32 2, i32 7>
 ; SSE-NEXT:    ret <4 x float> [[V3]]
 ;
 ; AVX-LABEL: @PR34724(
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index e99e21641531ab..937d4043adc0c4 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -537,32 +537,18 @@ define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16)
 
 declare ptr @getscaleptr()
 define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) nofree nosync {
-; SSE2-LABEL: @PR47558_multiple_use_load(
-; SSE2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; SSE2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; SSE2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; SSE2-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
-; SSE2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; SSE2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; SSE2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; SSE2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; SSE2-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
-; SSE2-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
-; SSE2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; SSE2-NEXT:    ret void
-;
-; AVX2-LABEL: @PR47558_multiple_use_load(
-; AVX2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; AVX2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; AVX2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; AVX2-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
-; AVX2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; AVX2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; AVX2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; AVX2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
-; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; AVX2-NEXT:    ret void
+; CHECK-LABEL: @PR47558_multiple_use_load(
+; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> poison, float [[SCALE]], i32 0
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> poison, float [[T4]], i32 0
+; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; CHECK-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
   %op = load <2 x float>, ptr %opptr, align 4
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index cf3ec41c2935e3..bdd05a1a37c70f 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -520,32 +520,18 @@ define <2 x float> @load_f32_insert_v2f32_asan(ptr align 16 dereferenceable(16)
 
 declare ptr @getscaleptr()
 define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr nocapture nonnull readonly %opptr) {
-; SSE2-LABEL: @PR47558_multiple_use_load(
-; SSE2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; SSE2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; SSE2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; SSE2-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
-; SSE2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; SSE2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; SSE2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; SSE2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
-; SSE2-NEXT:    [[T5:%.*]] = extractelement <2 x float> [[T3]], i32 1
-; SSE2-NEXT:    [[RESULT1:%.*]] = insertelement <2 x float> [[RESULT0]], float [[T5]], i32 1
-; SSE2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; SSE2-NEXT:    ret void
-;
-; AVX2-LABEL: @PR47558_multiple_use_load(
-; AVX2-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
-; AVX2-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
-; AVX2-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
-; AVX2-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
-; AVX2-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
-; AVX2-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
-; AVX2-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
-; AVX2-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
-; AVX2-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
-; AVX2-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
-; AVX2-NEXT:    ret void
+; CHECK-LABEL: @PR47558_multiple_use_load(
+; CHECK-NEXT:    [[SCALEPTR:%.*]] = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
+; CHECK-NEXT:    [[OP:%.*]] = load <2 x float>, ptr [[OPPTR:%.*]], align 4
+; CHECK-NEXT:    [[SCALE:%.*]] = load float, ptr [[SCALEPTR]], align 16
+; CHECK-NEXT:    [[T1:%.*]] = insertelement <2 x float> undef, float [[SCALE]], i32 0
+; CHECK-NEXT:    [[T2:%.*]] = insertelement <2 x float> [[T1]], float [[SCALE]], i32 1
+; CHECK-NEXT:    [[T3:%.*]] = fmul <2 x float> [[OP]], [[T2]]
+; CHECK-NEXT:    [[T4:%.*]] = extractelement <2 x float> [[T3]], i32 0
+; CHECK-NEXT:    [[RESULT0:%.*]] = insertelement <2 x float> undef, float [[T4]], i32 0
+; CHECK-NEXT:    [[RESULT1:%.*]] = shufflevector <2 x float> [[RESULT0]], <2 x float> [[T3]], <2 x i32> <i32 0, i32 3>
+; CHECK-NEXT:    store <2 x float> [[RESULT1]], ptr [[RESULTPTR:%.*]], align 8
+; CHECK-NEXT:    ret void
 ;
   %scaleptr = tail call nonnull align 16 dereferenceable(64) ptr @getscaleptr()
   %op = load <2 x float>, ptr %opptr, align 4



More information about the llvm-commits mailing list