[clang] 05dbdb0 - Revert "[InstCombine] canonicalize trunc + insert as bitcast + shuffle, part 1 (2nd try)"

Thu Dec 8 11:18:26 PST 2022

Author: Sanjay Patel
Date: 2022-12-08T14:16:46-05:00
New Revision: 05dbdb0088a3f5541d9e91c61a564d0aa4704f4f

URL: https://github.com/llvm/llvm-project/commit/05dbdb0088a3f5541d9e91c61a564d0aa4704f4f
DIFF: https://github.com/llvm/llvm-project/commit/05dbdb0088a3f5541d9e91c61a564d0aa4704f4f.diff

LOG: Revert "[InstCombine] canonicalize trunc + insert as bitcast + shuffle, part 1 (2nd try)"

This reverts commit e71b81cab09bf33e3b08ed600418b72cc4117461.

As discussed in the planned follow-on to this patch (D138874),
this patch and the subsequent patches in this set can cause trouble
for the backend, and there's probably no quick fix. We may even
want to canonicalize in the opposite direction (towards insertelement).

Added: 
    

Modified: 
    clang/test/Headers/wasm.c
    llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
    llvm/test/Transforms/InstCombine/insert-trunc.ll
    llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
    llvm/test/Transforms/InstCombine/vec_phi_extract.ll
    llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
    llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Removed: 
    


################################################################################
diff  --git a/clang/test/Headers/wasm.c b/clang/test/Headers/wasm.c
index 79dc67eaa4ef8..53acbf4de4c96 100644
--- a/clang/test/Headers/wasm.c
+++ b/clang/test/Headers/wasm.c
@@ -1475,8 +1475,8 @@ v128_t test_f64x2_ge(v128_t a, v128_t b) {
 
 // CHECK-LABEL: @test_v128_not(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[NOT_I:%.*]] = xor <4 x i32> [[A:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
-// CHECK-NEXT:    ret <4 x i32> [[NOT_I]]
+// CHECK-NEXT:    [[NEG_I:%.*]] = xor <4 x i32> [[A:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT:    ret <4 x i32> [[NEG_I]]
 //
 v128_t test_v128_not(v128_t a) {
   return wasm_v128_not(a);
@@ -1511,8 +1511,8 @@ v128_t test_v128_xor(v128_t a, v128_t b) {
 
 // CHECK-LABEL: @test_v128_andnot(
 // CHECK-NEXT:  entry:
-// CHECK-NEXT:    [[NOT_I:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
-// CHECK-NEXT:    [[AND_I:%.*]] = and <4 x i32> [[NOT_I]], [[A:%.*]]
+// CHECK-NEXT:    [[NEG_I:%.*]] = xor <4 x i32> [[B:%.*]], <i32 -1, i32 -1, i32 -1, i32 -1>
+// CHECK-NEXT:    [[AND_I:%.*]] = and <4 x i32> [[NEG_I]], [[A:%.*]]
 // CHECK-NEXT:    ret <4 x i32> [[AND_I]]
 //
 v128_t test_v128_andnot(v128_t a, v128_t b) {
@@ -1596,11 +1596,12 @@ v128_t test_i8x16_popcnt(v128_t a) {
 // CHECK-LABEL: @test_i8x16_shl(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <4 x i8>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[VEC___B_I]], <4 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHL_I:%.*]] = shl <16 x i8> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[SHL_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[SHL_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i8x16_shl(v128_t a, uint32_t b) {
   return wasm_i8x16_shl(a, b);
@@ -1609,11 +1610,12 @@ v128_t test_i8x16_shl(v128_t a, uint32_t b) {
 // CHECK-LABEL: @test_i8x16_shr(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <4 x i8>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[VEC___B_I]], <4 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHR_I:%.*]] = ashr <16 x i8> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[SHR_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[SHR_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i8x16_shr(v128_t a, uint32_t b) {
   return wasm_i8x16_shr(a, b);
@@ -1622,11 +1624,12 @@ v128_t test_i8x16_shr(v128_t a, uint32_t b) {
 // CHECK-LABEL: @test_u8x16_shr(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <16 x i8>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <4 x i8>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <4 x i8> [[VEC___B_I]], <4 x i8> poison, <16 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i8
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <16 x i8> undef, i8 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <16 x i8> [[TMP2]], <16 x i8> poison, <16 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHR_I:%.*]] = lshr <16 x i8> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <16 x i8> [[SHR_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <16 x i8> [[SHR_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_u8x16_shr(v128_t a, uint32_t b) {
   return wasm_u8x16_shr(a, b);
@@ -1810,11 +1813,12 @@ uint32_t test_i16x8_bitmask(v128_t a) {
 // CHECK-LABEL: @test_i16x8_shl(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <2 x i16>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[VEC___B_I]], <2 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> undef, i16 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHL_I:%.*]] = shl <8 x i16> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHL_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[SHL_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i16x8_shl(v128_t a, uint32_t b) {
   return wasm_i16x8_shl(a, b);
@@ -1823,11 +1827,12 @@ v128_t test_i16x8_shl(v128_t a, uint32_t b) {
 // CHECK-LABEL: @test_i16x8_shr(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <2 x i16>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[VEC___B_I]], <2 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> undef, i16 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHR_I:%.*]] = ashr <8 x i16> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHR_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[SHR_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_i16x8_shr(v128_t a, uint32_t b) {
   return wasm_i16x8_shr(a, b);
@@ -1836,11 +1841,12 @@ v128_t test_i16x8_shr(v128_t a, uint32_t b) {
 // CHECK-LABEL: @test_u16x8_shr(
 // CHECK-NEXT:  entry:
 // CHECK-NEXT:    [[TMP0:%.*]] = bitcast <4 x i32> [[A:%.*]] to <8 x i16>
-// CHECK-NEXT:    [[VEC___B_I:%.*]] = bitcast i32 [[B:%.*]] to <2 x i16>
-// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <2 x i16> [[VEC___B_I]], <2 x i16> poison, <8 x i32> zeroinitializer
+// CHECK-NEXT:    [[TMP1:%.*]] = trunc i32 [[B:%.*]] to i16
+// CHECK-NEXT:    [[TMP2:%.*]] = insertelement <8 x i16> undef, i16 [[TMP1]], i64 0
+// CHECK-NEXT:    [[SH_PROM_I:%.*]] = shufflevector <8 x i16> [[TMP2]], <8 x i16> poison, <8 x i32> zeroinitializer
 // CHECK-NEXT:    [[SHR_I:%.*]] = lshr <8 x i16> [[TMP0]], [[SH_PROM_I]]
-// CHECK-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[SHR_I]] to <4 x i32>
-// CHECK-NEXT:    ret <4 x i32> [[TMP1]]
+// CHECK-NEXT:    [[TMP3:%.*]] = bitcast <8 x i16> [[SHR_I]] to <4 x i32>
+// CHECK-NEXT:    ret <4 x i32> [[TMP3]]
 //
 v128_t test_u16x8_shr(v128_t a, uint32_t b) {
   return wasm_u16x8_shr(a, b);

diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index d45b46599f604..bfc45d1627db1 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -39,7 +39,6 @@
 #include <cassert>
 #include <cstdint>
 #include <iterator>
-#include <numeric>
 #include <utility>
 
 #define DEBUG_TYPE "instcombine"
@@ -1516,66 +1515,6 @@ static Instruction *narrowInsElt(InsertElementInst &InsElt,
   return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
 }
 
-/// Try to convert scalar extraction ops (shift+trunc) with insertelt to
-/// bitcast and shuffle:
-/// inselt V, (lshr (trunc X)), IndexC --> shuffle (bitcast X), V, Mask
-static Instruction *foldTruncInsElt(InsertElementInst &InsElt, bool IsBigEndian,
-                                    InstCombiner::BuilderTy &Builder) {
-  // inselt undef, (trunc T), IndexC
-  // TODO: Allow any base vector value.
-  // TODO: The one-use limitation could be removed for some cases (eg, no
-  //       extra shuffle is needed and a shift is eliminated).
-  auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
-  Value *T, *V = InsElt.getOperand(0);
-  uint64_t IndexC;
-  if (!VTy || !match(InsElt.getOperand(1), m_OneUse(m_Trunc(m_Value(T)))) ||
-      !match(InsElt.getOperand(2), m_ConstantInt(IndexC)) ||
-      !match(V, m_Undef()))
-    return nullptr;
-
-  Type *SrcTy = T->getType();
-  unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
-  unsigned VecEltWidth = VTy->getScalarSizeInBits();
-  if (ScalarWidth % VecEltWidth != 0)
-    return nullptr;
-
-  unsigned NumEltsInScalar = ScalarWidth / VecEltWidth;
-  Value *X = T;
-  if ((IsBigEndian && IndexC == NumEltsInScalar - 1) ||
-      (!IsBigEndian && IndexC == 0)) {
-    // The insert is to the LSB end of the vector (depends on endian).
-    // That's all we need.
-  } else {
-    // TODO: Look through a shift-right and translate the insert index.
-    return nullptr;
-  }
-
-  // Bitcast the scalar to a vector type with the destination element type.
-  Type *CastTy = FixedVectorType::get(VTy->getElementType(), NumEltsInScalar);
-  Value *VecX = Builder.CreateBitCast(X, CastTy, "vec." + X->getName());
-
-  unsigned NumElts = VTy->getNumElements();
-  if (NumElts > NumEltsInScalar) {
-    // Pad the source vector with undef elements, so it matches the dest type.
-    SmallVector<int> IdentityPaddedMask(NumElts, UndefMaskElem);
-    for (unsigned i = 0; i != NumEltsInScalar; ++i)
-      IdentityPaddedMask[i] = i;
-    VecX = Builder.CreateShuffleVector(VecX, IdentityPaddedMask);
-  } else if (NumElts < NumEltsInScalar) {
-    // Narrow the source vector, so it matches the dest type.
-    SmallVector<int> IdentityExtractMask(NumElts);
-    std::iota(IdentityExtractMask.begin(), IdentityExtractMask.end(), 0);
-    VecX = Builder.CreateShuffleVector(VecX, IdentityExtractMask);
-  }
-
-  // Insert the truncated element using a select-shuffle. All lanes but one are
-  // from the base vector V.
-  SmallVector<int> SelectMask(NumElts);
-  std::iota(SelectMask.begin(), SelectMask.end(), 0);
-  SelectMask[IndexC] = (int)IndexC + NumElts;
-  return new ShuffleVectorInst(V, VecX, SelectMask);
-}
-
 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   Value *VecOp    = IE.getOperand(0);
   Value *ScalarOp = IE.getOperand(1);
@@ -1703,9 +1642,6 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
-  if (Instruction *Shuf = foldTruncInsElt(IE, DL.isBigEndian(), Builder))
-    return Shuf;
-
   return nullptr;
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/insert-trunc.ll b/llvm/test/Transforms/InstCombine/insert-trunc.ll
index 20922f97648b9..3ae128e55b43b 100644
--- a/llvm/test/Transforms/InstCombine/insert-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/insert-trunc.ll
@@ -1,20 +1,15 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL,BE
-; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL,LE
+; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL
 
 declare void @use(i8)
 declare void @use64(i64)
 
 define <4 x i16> @low_index_same_length_poison_basevec(i64 %x) {
-; BE-LABEL: @low_index_same_length_poison_basevec(
-; BE-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
-; BE-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 0
-; BE-NEXT:    ret <4 x i16> [[R]]
-;
-; LE-LABEL: @low_index_same_length_poison_basevec(
-; LE-NEXT:    [[VEC_X:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; LE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_X]], <4 x i16> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; LE-NEXT:    ret <4 x i16> [[R]]
+; ALL-LABEL: @low_index_same_length_poison_basevec(
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 0
+; ALL-NEXT:    ret <4 x i16> [[R]]
 ;
   %t = trunc i64 %x to i16
   %r = insertelement <4 x i16> poison, i16 %t, i64 0
@@ -22,15 +17,10 @@ define <4 x i16> @low_index_same_length_poison_basevec(i64 %x) {
 }
 
 define <4 x i16> @high_index_same_length_poison_basevec(i64 %x) {
-; BE-LABEL: @high_index_same_length_poison_basevec(
-; BE-NEXT:    [[VEC_X:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; BE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_X]], <4 x i16> poison, <4 x i32> <i32 undef, i32 undef, i32 undef, i32 3>
-; BE-NEXT:    ret <4 x i16> [[R]]
-;
-; LE-LABEL: @high_index_same_length_poison_basevec(
-; LE-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
-; LE-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 3
-; LE-NEXT:    ret <4 x i16> [[R]]
+; ALL-LABEL: @high_index_same_length_poison_basevec(
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 3
+; ALL-NEXT:    ret <4 x i16> [[R]]
 ;
   %t = trunc i64 %x to i16
   %r = insertelement <4 x i16> poison, i16 %t, i64 3
@@ -49,15 +39,10 @@ define <4 x i16> @wrong_index_same_length_poison_basevec(i64 %x) {
 }
 
 define <8 x i16> @low_index_longer_length_poison_basevec(i64 %x) {
-; BE-LABEL: @low_index_longer_length_poison_basevec(
-; BE-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
-; BE-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 0
-; BE-NEXT:    ret <8 x i16> [[R]]
-;
-; LE-LABEL: @low_index_longer_length_poison_basevec(
-; LE-NEXT:    [[VEC_X:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; LE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_X]], <4 x i16> poison, <8 x i32> <i32 0, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
-; LE-NEXT:    ret <8 x i16> [[R]]
+; ALL-LABEL: @low_index_longer_length_poison_basevec(
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 0
+; ALL-NEXT:    ret <8 x i16> [[R]]
 ;
   %t = trunc i64 %x to i16
   %r = insertelement <8 x i16> poison, i16 %t, i64 0
@@ -65,15 +50,10 @@ define <8 x i16> @low_index_longer_length_poison_basevec(i64 %x) {
 }
 
 define <8 x i16> @high_index_longer_length_poison_basevec(i64 %x) {
-; BE-LABEL: @high_index_longer_length_poison_basevec(
-; BE-NEXT:    [[VEC_X:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; BE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_X]], <4 x i16> poison, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; BE-NEXT:    ret <8 x i16> [[R]]
-;
-; LE-LABEL: @high_index_longer_length_poison_basevec(
-; LE-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
-; LE-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 3
-; LE-NEXT:    ret <8 x i16> [[R]]
+; ALL-LABEL: @high_index_longer_length_poison_basevec(
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 3
+; ALL-NEXT:    ret <8 x i16> [[R]]
 ;
   %t = trunc i64 %x to i16
   %r = insertelement <8 x i16> poison, i16 %t, i64 3
@@ -92,15 +72,10 @@ define <8 x i16> @wrong_index_longer_length_poison_basevec(i64 %x) {
 }
 
 define <2 x i16> @low_index_shorter_length_poison_basevec(i64 %x) {
-; BE-LABEL: @low_index_shorter_length_poison_basevec(
-; BE-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
-; BE-NEXT:    [[R:%.*]] = insertelement <2 x i16> poison, i16 [[T]], i64 0
-; BE-NEXT:    ret <2 x i16> [[R]]
-;
-; LE-LABEL: @low_index_shorter_length_poison_basevec(
-; LE-NEXT:    [[VEC_X:%.*]] = bitcast i64 [[X:%.*]] to <4 x i16>
-; LE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_X]], <4 x i16> poison, <2 x i32> <i32 0, i32 undef>
-; LE-NEXT:    ret <2 x i16> [[R]]
+; ALL-LABEL: @low_index_shorter_length_poison_basevec(
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[X:%.*]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <2 x i16> poison, i16 [[T]], i64 0
+; ALL-NEXT:    ret <2 x i16> [[R]]
 ;
   %t = trunc i64 %x to i16
   %r = insertelement <2 x i16> poison, i16 %t, i64 0
@@ -169,17 +144,11 @@ define <4 x i16> @lshr_same_length_poison_basevec_be(i64 %x) {
 }
 
 define <4 x i16> @lshr_same_length_poison_basevec_both_endian(i64 %x) {
-; BE-LABEL: @lshr_same_length_poison_basevec_both_endian(
-; BE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
-; BE-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i16
-; BE-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 0
-; BE-NEXT:    ret <4 x i16> [[R]]
-;
-; LE-LABEL: @lshr_same_length_poison_basevec_both_endian(
-; LE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
-; LE-NEXT:    [[VEC_S:%.*]] = bitcast i64 [[S]] to <4 x i16>
-; LE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_S]], <4 x i16> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; LE-NEXT:    ret <4 x i16> [[R]]
+; ALL-LABEL: @lshr_same_length_poison_basevec_both_endian(
+; ALL-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <4 x i16> poison, i16 [[T]], i64 0
+; ALL-NEXT:    ret <4 x i16> [[R]]
 ;
   %s = lshr i64 %x, 48
   %t = trunc i64 %s to i16
@@ -201,17 +170,11 @@ define <4 x i16> @lshr_wrong_index_same_length_poison_basevec(i64 %x) {
 }
 
 define <8 x i16> @lshr_longer_length_poison_basevec_le(i64 %x) {
-; BE-LABEL: @lshr_longer_length_poison_basevec_le(
-; BE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
-; BE-NEXT:    [[VEC_S:%.*]] = bitcast i64 [[S]] to <4 x i16>
-; BE-NEXT:    [[R:%.*]] = shufflevector <4 x i16> [[VEC_S]], <4 x i16> poison, <8 x i32> <i32 undef, i32 undef, i32 undef, i32 3, i32 undef, i32 undef, i32 undef, i32 undef>
-; BE-NEXT:    ret <8 x i16> [[R]]
-;
-; LE-LABEL: @lshr_longer_length_poison_basevec_le(
-; LE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
-; LE-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i16
-; LE-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 3
-; LE-NEXT:    ret <8 x i16> [[R]]
+; ALL-LABEL: @lshr_longer_length_poison_basevec_le(
+; ALL-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 48
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i16
+; ALL-NEXT:    [[R:%.*]] = insertelement <8 x i16> poison, i16 [[T]], i64 3
+; ALL-NEXT:    ret <8 x i16> [[R]]
 ;
   %s = lshr i64 %x, 48
   %t = trunc i64 %s to i16
@@ -285,17 +248,11 @@ define <4 x i8> @lshr_wrong_index_shorter_length_poison_basevec(i64 %x) {
 }
 
 define <4 x i8> @lshr_wrong_shift_shorter_length_poison_basevec(i64 %x) {
-; BE-LABEL: @lshr_wrong_shift_shorter_length_poison_basevec(
-; BE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 57
-; BE-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i8
-; BE-NEXT:    [[R:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0
-; BE-NEXT:    ret <4 x i8> [[R]]
-;
-; LE-LABEL: @lshr_wrong_shift_shorter_length_poison_basevec(
-; LE-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 57
-; LE-NEXT:    [[VEC_S:%.*]] = bitcast i64 [[S]] to <8 x i8>
-; LE-NEXT:    [[R:%.*]] = shufflevector <8 x i8> [[VEC_S]], <8 x i8> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; LE-NEXT:    ret <4 x i8> [[R]]
+; ALL-LABEL: @lshr_wrong_shift_shorter_length_poison_basevec(
+; ALL-NEXT:    [[S:%.*]] = lshr i64 [[X:%.*]], 57
+; ALL-NEXT:    [[T:%.*]] = trunc i64 [[S]] to i8
+; ALL-NEXT:    [[R:%.*]] = insertelement <4 x i8> poison, i8 [[T]], i64 0
+; ALL-NEXT:    ret <4 x i8> [[R]]
 ;
   %s = lshr i64 %x, 57
   %t = trunc i64 %s to i8

diff  --git a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
index 2fd7b4bdffa49..79c3d37cd53c3 100644
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract-inseltpoison.ll
@@ -83,20 +83,21 @@ ret:
 define void @nocopy(i64 %val, i32  %limit, ptr %ptr) {
 ; CHECK-LABEL: @nocopy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VEC_VAL:%.*]] = bitcast i64 [[VAL:%.*]] to <2 x i32>
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[VEC_VAL]], <2 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <16 x i32> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP2]], i64 0
-; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i64 0
+; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i64 1
 ; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[ELTCOPY]], 10
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[ELT]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[ELT]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP4]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
 ; CHECK:       ret:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
index 76ba2eb007b05..1bdc21724be51 100644
--- a/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
+++ b/llvm/test/Transforms/InstCombine/vec_phi_extract.ll
@@ -83,20 +83,21 @@ ret:
 define void @nocopy(i64 %val, i32  %limit, ptr %ptr) {
 ; CHECK-LABEL: @nocopy(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VEC_VAL:%.*]] = bitcast i64 [[VAL:%.*]] to <2 x i32>
-; CHECK-NEXT:    [[TMP0:%.*]] = shufflevector <2 x i32> [[VEC_VAL]], <2 x i32> poison, <16 x i32> zeroinitializer
-; CHECK-NEXT:    [[TMP1:%.*]] = add <16 x i32> [[TMP0]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i64 [[VAL:%.*]] to i32
+; CHECK-NEXT:    [[TMP1:%.*]] = insertelement <16 x i32> undef, i32 [[TMP0]], i64 0
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i32> [[TMP1]], <16 x i32> poison, <16 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP3:%.*]] = add <16 x i32> [[TMP2]], <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
 ; CHECK-NEXT:    br label [[LOOP:%.*]]
 ; CHECK:       loop:
-; CHECK-NEXT:    [[TMP2:%.*]] = phi <16 x i32> [ [[TMP1]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
-; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP2]], i64 0
-; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP2]], i64 1
+; CHECK-NEXT:    [[TMP4:%.*]] = phi <16 x i32> [ [[TMP3]], [[ENTRY:%.*]] ], [ [[INC:%.*]], [[LOOP]] ]
+; CHECK-NEXT:    [[ELT:%.*]] = extractelement <16 x i32> [[TMP4]], i64 0
+; CHECK-NEXT:    [[ELTCOPY:%.*]] = extractelement <16 x i32> [[TMP4]], i64 1
 ; CHECK-NEXT:    [[END:%.*]] = icmp ult i32 [[ELT]], [[LIMIT:%.*]]
-; CHECK-NEXT:    [[TMP3:%.*]] = add i32 [[ELTCOPY]], 10
-; CHECK-NEXT:    [[TMP4:%.*]] = sext i32 [[ELT]] to i64
-; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP4]]
-; CHECK-NEXT:    store i32 [[TMP3]], ptr [[TMP5]], align 4
-; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP2]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
+; CHECK-NEXT:    [[TMP5:%.*]] = add i32 [[ELTCOPY]], 10
+; CHECK-NEXT:    [[TMP6:%.*]] = sext i32 [[ELT]] to i64
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i32, ptr [[PTR:%.*]], i64 [[TMP6]]
+; CHECK-NEXT:    store i32 [[TMP5]], ptr [[TMP7]], align 4
+; CHECK-NEXT:    [[INC]] = add <16 x i32> [[TMP4]], <i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16, i32 16>
 ; CHECK-NEXT:    br i1 [[END]], label [[LOOP]], label [[RET:%.*]]
 ; CHECK:       ret:
 ; CHECK-NEXT:    ret void

diff  --git a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
index 86a5b97d22686..d394fda439583 100644
--- a/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
+++ b/llvm/test/Transforms/LoopVectorize/ARM/pointer_iv.ll
@@ -155,21 +155,22 @@ end:
 define hidden void @pointer_phi_v8i16_add1(i16* noalias nocapture readonly %A, i16* noalias nocapture %B, i32 %y) {
 ; CHECK-LABEL: @pointer_phi_v8i16_add1(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[VEC_Y:%.*]] = bitcast i32 [[Y:%.*]] to <2 x i16>
-; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <2 x i16> [[VEC_Y]], <2 x i16> poison, <8 x i32> zeroinitializer
+; CHECK-NEXT:    [[TMP0:%.*]] = trunc i32 [[Y:%.*]] to i16
+; CHECK-NEXT:    [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP0]], i64 0
+; CHECK-NEXT:    [[BROADCAST_SPLAT:%.*]] = shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer
 ; CHECK-NEXT:    br label [[VECTOR_BODY:%.*]]
 ; CHECK:       vector.body:
 ; CHECK-NEXT:    [[INDEX:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i16, i16* [[A:%.*]], i32 [[INDEX]]
 ; CHECK-NEXT:    [[NEXT_GEP4:%.*]] = getelementptr i16, i16* [[B:%.*]], i32 [[INDEX]]
-; CHECK-NEXT:    [[TMP0:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
-; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP0]], align 2
-; CHECK-NEXT:    [[TMP1:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
-; CHECK-NEXT:    [[TMP2:%.*]] = bitcast i16* [[NEXT_GEP4]] to <8 x i16>*
-; CHECK-NEXT:    store <8 x i16> [[TMP1]], <8 x i16>* [[TMP2]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = bitcast i16* [[NEXT_GEP]] to <8 x i16>*
+; CHECK-NEXT:    [[WIDE_LOAD:%.*]] = load <8 x i16>, <8 x i16>* [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = add <8 x i16> [[WIDE_LOAD]], [[BROADCAST_SPLAT]]
+; CHECK-NEXT:    [[TMP3:%.*]] = bitcast i16* [[NEXT_GEP4]] to <8 x i16>*
+; CHECK-NEXT:    store <8 x i16> [[TMP2]], <8 x i16>* [[TMP3]], align 2
 ; CHECK-NEXT:    [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8
-; CHECK-NEXT:    [[TMP3:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
-; CHECK-NEXT:    br i1 [[TMP3]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
+; CHECK-NEXT:    [[TMP4:%.*]] = icmp eq i32 [[INDEX_NEXT]], 1000
+; CHECK-NEXT:    br i1 [[TMP4]], label [[END:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP7:![0-9]+]]
 ; CHECK:       end:
 ; CHECK-NEXT:    ret void
 ;

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 63b90ac6dc429..1ded4db187841 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -50,15 +50,15 @@ define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 de
 ; SSE-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
 ; SSE-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
 ; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; SSE-NEXT:    [[VEC_V_VAL20:%.*]] = bitcast i64 [[V_VAL20]] to <2 x i32>
-; SSE-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[VEC_V_VAL20]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP2:%.*]] = lshr i64 [[V_VAL20]], 32
-; SSE-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-; SSE-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP3]], i64 1
-; SSE-NEXT:    [[TMP5:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i64 2
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i64 3
-; SSE-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP7]] to <4 x float>
+; SSE-NEXT:    [[TMP1:%.*]] = lshr i64 [[V_VAL20]], 32
+; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V_VAL20]], i64 0
+; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP1]], i64 1
+; SSE-NEXT:    [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
+; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+; SSE-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
+; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
+; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
+; SSE-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
@@ -66,15 +66,15 @@ define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 de
 ; AVX-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
 ; AVX-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
 ; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; AVX-NEXT:    [[VEC_V_VAL20:%.*]] = bitcast i64 [[V_VAL20]] to <2 x i32>
-; AVX-NEXT:    [[TMP1:%.*]] = shufflevector <2 x i32> [[VEC_V_VAL20]], <2 x i32> poison, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef>
-; AVX-NEXT:    [[TMP2:%.*]] = lshr i64 [[V_VAL20]], 32
-; AVX-NEXT:    [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-; AVX-NEXT:    [[TMP4:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[TMP3]], i64 1
-; AVX-NEXT:    [[TMP5:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP6:%.*]] = insertelement <4 x i32> [[TMP4]], i32 [[TMP5]], i64 2
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP6]], i32 [[TMP5]], i64 3
-; AVX-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP7]] to <4 x float>
+; AVX-NEXT:    [[TMP1:%.*]] = trunc i64 [[V_VAL20]] to i32
+; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i64 0
+; AVX-NEXT:    [[TMP3:%.*]] = lshr i64 [[V_VAL20]], 32
+; AVX-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
+; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i64 1
+; AVX-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
+; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
+; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
+; AVX-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry: