[llvm] 4446f71 - [InstCombine] try to fold a pair of insertelements into one insertelement

Sanjay Patel via llvm-commits llvm-commits at lists.llvm.org
Mon Dec 12 07:55:41 PST 2022


Author: Sanjay Patel
Date: 2022-12-12T10:39:58-05:00
New Revision: 4446f71ce392a13e64299d4efe9d7c2a2b768c6c

URL: https://github.com/llvm/llvm-project/commit/4446f71ce392a13e64299d4efe9d7c2a2b768c6c
DIFF: https://github.com/llvm/llvm-project/commit/4446f71ce392a13e64299d4efe9d7c2a2b768c6c.diff

LOG: [InstCombine] try to fold a pair of insertelements into one insertelement

This replaces earlier patches that tried to convert related patterns to
shuffles (D138872, D138873, D138874 - reverted/abandoned); those caused
codegen problems and were questionable as a canonicalization because an
insertelement is a simpler op than a shuffle.

This detects a larger pattern -- insert-of-insert -- and replaces it with
another insert, so this hopefully does not cause any problems.

As noted by TODO items in the code and tests, this could go a lot further.
But this is enough to reduce the motivating test from issue #17113.

Example proofs:
https://alive2.llvm.org/ce/z/NnUv3a
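
As a sketch of the fold (the little-endian insert_01_poison_v4i16 test updated
below shows the same result), the pair of truncated inserts:

  %hi32 = lshr i32 %x, 16
  %hi16 = trunc i32 %hi32 to i16
  %lo16 = trunc i32 %x to i16
  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1

becomes a single insert of the wide scalar plus a bitcast:

  %tmp1 = insertelement <2 x i32> poison, i32 %x, i64 0
  %ins1 = bitcast <2 x i32> %tmp1 to <4 x i16>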

I drafted a version of this for AggressiveInstCombine, but it seems that
would uncover yet another phase ordering gap. If we do generalize this to
handle the full range of potential patterns, that may be worth looking at
again.

Differential Revision: https://reviews.llvm.org/D139668

Added: 
    

Modified: 
    llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
    llvm/test/Transforms/InstCombine/insertelt-trunc.ll
    llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll

Removed: 
    


################################################################################
diff  --git a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
index bfc45d1627db1..7ce647a9c9db6 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineVectorOps.cpp
@@ -1515,6 +1515,57 @@ static Instruction *narrowInsElt(InsertElementInst &InsElt,
   return CastInst::Create(CastOpcode, NewInsElt, InsElt.getType());
 }
 
+/// If we are inserting 2 halves of a value into adjacent elements of a vector,
+/// try to convert to a single insert with appropriate bitcasts.
+static Instruction *foldTruncInsEltPair(InsertElementInst &InsElt,
+                                        bool IsBigEndian,
+                                        InstCombiner::BuilderTy &Builder) {
+  Value *VecOp    = InsElt.getOperand(0);
+  Value *ScalarOp = InsElt.getOperand(1);
+  Value *IndexOp  = InsElt.getOperand(2);
+
+  // inselt (inselt BaseVec, (trunc X), Index0), (trunc (lshr X, BW/2)), Index1
+  // TODO: The insertion order could be reversed.
+  // TODO: Detect smaller fractions of the scalar.
+  // TODO: One-use checks are conservative.
+  auto *VTy = dyn_cast<FixedVectorType>(InsElt.getType());
+  Value *X, *BaseVec;
+  uint64_t ShAmt, Index0, Index1;
+  if (!VTy || (VTy->getNumElements() & 1) ||
+      !match(VecOp, m_OneUse(m_InsertElt(m_Value(BaseVec), m_Trunc(m_Value(X)),
+                                         m_ConstantInt(Index0)))) ||
+      !match(ScalarOp, m_OneUse(m_Trunc(m_LShr(m_Specific(X),
+                                               m_ConstantInt(ShAmt))))) ||
+      !match(IndexOp, m_ConstantInt(Index1)))
+    return nullptr;
+
+  Type *SrcTy = X->getType();
+  unsigned ScalarWidth = SrcTy->getScalarSizeInBits();
+  unsigned VecEltWidth = VTy->getScalarSizeInBits();
+  if (ScalarWidth != VecEltWidth * 2 || ShAmt != VecEltWidth)
+    return nullptr;
+
+  // The low half must be inserted at element +1 for big-endian.
+  // The high half must be inserted at element +1 for little-endian.
+  if (IsBigEndian ? Index0 != Index1 + 1 : Index0 + 1 != Index1)
+    return nullptr;
+
+  // The high half must be inserted at an even element for big-endian.
+  // The low half must be inserted at an even element for little-endian.
+  if (IsBigEndian ? Index1 & 1 : Index0 & 1)
+    return nullptr;
+
+  // Bitcast the base vector to a vector type with the source element type.
+  Type *CastTy = FixedVectorType::get(SrcTy, VTy->getNumElements() / 2);
+  Value *CastBaseVec = Builder.CreateBitCast(BaseVec, CastTy);
+
+  // Scale the insert index for a vector with half as many elements.
+  // bitcast (inselt (bitcast BaseVec), X, NewIndex)
+  uint64_t NewIndex = IsBigEndian ? Index1 / 2 : Index0 / 2;
+  Value *NewInsert = Builder.CreateInsertElement(CastBaseVec, X, NewIndex);
+  return new BitCastInst(NewInsert, VTy);
+}
+
 Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   Value *VecOp    = IE.getOperand(0);
   Value *ScalarOp = IE.getOperand(1);
@@ -1642,6 +1693,9 @@ Instruction *InstCombinerImpl::visitInsertElementInst(InsertElementInst &IE) {
   if (Instruction *Ext = narrowInsElt(IE, Builder))
     return Ext;
 
+  if (Instruction *Ext = foldTruncInsEltPair(IE, DL.isBigEndian(), Builder))
+    return Ext;
+
   return nullptr;
 }
 

diff  --git a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
index 12b944711f704..bd829d5526d1a 100644
--- a/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
+++ b/llvm/test/Transforms/InstCombine/insertelt-trunc.ll
@@ -1,16 +1,24 @@
 ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
-; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL
-; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL
+; RUN: opt < %s -passes=instcombine -S -data-layout="E" | FileCheck %s --check-prefixes=ALL,BE
+; RUN: opt < %s -passes=instcombine -S -data-layout="e" | FileCheck %s --check-prefixes=ALL,LE
 
 
+declare void @use(i16)
+declare void @use_vec(<8 x i16>)
+
 define <4 x i16> @insert_01_poison_v4i16(i32 %x) {
-; ALL-LABEL: @insert_01_poison_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_01_poison_v4i16(
+; BE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; BE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_01_poison_v4i16(
+; LE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 0
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16>
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -21,13 +29,18 @@ define <4 x i16> @insert_01_poison_v4i16(i32 %x) {
 }
 
 define <8 x i16> @insert_10_poison_v8i16(i32 %x) {
-; ALL-LABEL: @insert_10_poison_v8i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
-; ALL-NEXT:    ret <8 x i16> [[INS1]]
+; BE-LABEL: @insert_10_poison_v8i16(
+; BE-NEXT:    [[TMP1:%.*]] = insertelement <4 x i32> poison, i32 [[X:%.*]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP1]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_10_poison_v8i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> poison, i16 [[LO16]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
+; LE-NEXT:    ret <8 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -37,6 +50,8 @@ define <8 x i16> @insert_10_poison_v8i16(i32 %x) {
   ret <8 x i16> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i32> @insert_12_poison_v4i32(i64 %x) {
 ; ALL-LABEL: @insert_12_poison_v4i32(
 ; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
@@ -54,6 +69,8 @@ define <4 x i32> @insert_12_poison_v4i32(i64 %x) {
   ret <4 x i32> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i16> @insert_21_poison_v4i16(i32 %x) {
 ; ALL-LABEL: @insert_21_poison_v4i16(
 ; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
@@ -72,13 +89,18 @@ define <4 x i16> @insert_21_poison_v4i16(i32 %x) {
 }
 
 define <4 x i32> @insert_23_poison_v4i32(i64 %x) {
-; ALL-LABEL: @insert_23_poison_v4i32(
-; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
-; ALL-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
-; ALL-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
-; ALL-NEXT:    ret <4 x i32> [[INS1]]
+; BE-LABEL: @insert_23_poison_v4i32(
+; BE-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
+; BE-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
+; BE-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> poison, i32 [[LO32]], i64 2
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
+; BE-NEXT:    ret <4 x i32> [[INS1]]
+;
+; LE-LABEL: @insert_23_poison_v4i32(
+; LE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i64> poison, i64 [[X:%.*]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i64> [[TMP1]] to <4 x i32>
+; LE-NEXT:    ret <4 x i32> [[INS1]]
 ;
   %hi64 = lshr i64 %x, 32
   %hi32 = trunc i64 %hi64 to i32
@@ -89,13 +111,18 @@ define <4 x i32> @insert_23_poison_v4i32(i64 %x) {
 }
 
 define <4 x i16> @insert_32_poison_v4i16(i32 %x) {
-; ALL-LABEL: @insert_32_poison_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_32_poison_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = insertelement <2 x i32> poison, i32 [[X:%.*]], i64 1
+; BE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP1]] to <4 x i16>
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_32_poison_v4i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 3
+; LE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -105,14 +132,23 @@ define <4 x i16> @insert_32_poison_v4i16(i32 %x) {
   ret <4 x i16> %ins1
 }
 
+; Similar to the above tests but with a non-poison base vector.
+
+; Vector is same size as scalar, so this is just a cast.
+; TODO: Could be swapped/rotated into place.
+
 define <2 x i16> @insert_01_v2i16(i32 %x, <2 x i16> %v) {
-; ALL-LABEL: @insert_01_v2i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1
-; ALL-NEXT:    ret <2 x i16> [[INS1]]
+; BE-LABEL: @insert_01_v2i16(
+; BE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; BE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; BE-NEXT:    [[INS0:%.*]] = insertelement <2 x i16> poison, i16 [[LO16]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = insertelement <2 x i16> [[INS0]], i16 [[HI16]], i64 1
+; BE-NEXT:    ret <2 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_01_v2i16(
+; LE-NEXT:    [[INS1:%.*]] = bitcast i32 [[X:%.*]] to <2 x i16>
+; LE-NEXT:    ret <2 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -123,13 +159,19 @@ define <2 x i16> @insert_01_v2i16(i32 %x, <2 x i16> %v) {
 }
 
 define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) {
-; ALL-LABEL: @insert_10_v8i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
-; ALL-NEXT:    ret <8 x i16> [[INS1]]
+; BE-LABEL: @insert_10_v8i16(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X:%.*]], i64 0
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_10_v8i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 0
+; LE-NEXT:    ret <8 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -139,6 +181,8 @@ define <8 x i16> @insert_10_v8i16(i32 %x, <8 x i16> %v) {
   ret <8 x i16> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i32> @insert_12_v4i32(i64 %x, <4 x i32> %v) {
 ; ALL-LABEL: @insert_12_v4i32(
 ; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
@@ -156,6 +200,8 @@ define <4 x i32> @insert_12_v4i32(i64 %x, <4 x i32> %v) {
   ret <4 x i32> %ins1
 }
 
+; negative test - larger element is not aligned in the vector
+
 define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) {
 ; ALL-LABEL: @insert_21_v4i16(
 ; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
@@ -174,13 +220,19 @@ define <4 x i16> @insert_21_v4i16(i32 %x, <4 x i16> %v) {
 }
 
 define <4 x i32> @insert_23_v4i32(i64 %x, <4 x i32> %v) {
-; ALL-LABEL: @insert_23_v4i32(
-; ALL-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
-; ALL-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
-; ALL-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
-; ALL-NEXT:    ret <4 x i32> [[INS1]]
+; BE-LABEL: @insert_23_v4i32(
+; BE-NEXT:    [[HI64:%.*]] = lshr i64 [[X:%.*]], 32
+; BE-NEXT:    [[HI32:%.*]] = trunc i64 [[HI64]] to i32
+; BE-NEXT:    [[LO32:%.*]] = trunc i64 [[X]] to i32
+; BE-NEXT:    [[INS0:%.*]] = insertelement <4 x i32> [[V:%.*]], i32 [[LO32]], i64 2
+; BE-NEXT:    [[INS1:%.*]] = insertelement <4 x i32> [[INS0]], i32 [[HI32]], i64 3
+; BE-NEXT:    ret <4 x i32> [[INS1]]
+;
+; LE-LABEL: @insert_23_v4i32(
+; LE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i32> [[V:%.*]] to <2 x i64>
+; LE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> [[TMP1]], i64 [[X:%.*]], i64 1
+; LE-NEXT:    [[INS1:%.*]] = bitcast <2 x i64> [[TMP2]] to <4 x i32>
+; LE-NEXT:    ret <4 x i32> [[INS1]]
 ;
   %hi64 = lshr i64 %x, 32
   %hi32 = trunc i64 %hi64 to i32
@@ -191,13 +243,19 @@ define <4 x i32> @insert_23_v4i32(i64 %x, <4 x i32> %v) {
 }
 
 define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) {
-; ALL-LABEL: @insert_32_v4i16(
-; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
-; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
-; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
-; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3
-; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
-; ALL-NEXT:    ret <4 x i16> [[INS1]]
+; BE-LABEL: @insert_32_v4i16(
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <4 x i16> [[V:%.*]] to <2 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i32> [[TMP1]], i32 [[X:%.*]], i64 1
+; BE-NEXT:    [[INS1:%.*]] = bitcast <2 x i32> [[TMP2]] to <4 x i16>
+; BE-NEXT:    ret <4 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_32_v4i16(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[V:%.*]], i16 [[LO16]], i64 3
+; LE-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 2
+; LE-NEXT:    ret <4 x i16> [[INS1]]
 ;
   %hi32 = lshr i32 %x, 16
   %hi16 = trunc i32 %hi32 to i16
@@ -206,3 +264,131 @@ define <4 x i16> @insert_32_v4i16(i32 %x, <4 x i16> %v) {
   %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 2
   ret <4 x i16> %ins1
 }
+
+; negative test - need half-width shift
+
+define <4 x i16> @insert_01_v4i16_wrong_shift1(i32 %x) {
+; ALL-LABEL: @insert_01_v4i16_wrong_shift1(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 8
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; ALL-NEXT:    ret <4 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 8
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
+  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1
+  ret <4 x i16> %ins1
+}
+
+; negative test - need common scalar
+
+define <4 x i16> @insert_01_v4i16_wrong_op(i32 %x, i32 %y) {
+; ALL-LABEL: @insert_01_v4i16_wrong_op(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[Y:%.*]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> poison, i16 [[LO16]], i64 0
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> [[INS0]], i16 [[HI16]], i64 1
+; ALL-NEXT:    ret <4 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %y to i16
+  %ins0 = insertelement <4 x i16> poison, i16 %lo16, i64 0
+  %ins1 = insertelement <4 x i16> %ins0, i16 %hi16, i64 1
+  ret <4 x i16> %ins1
+}
+
+; TODO: extra use doesn't have to prevent the fold.
+
+define <8 x i16> @insert_67_v4i16_uses1(i32 %x, <8 x i16> %v) {
+; ALL-LABEL: @insert_67_v4i16_uses1(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    call void @use(i16 [[HI16]])
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7
+; ALL-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  call void @use(i16 %hi16)
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7
+  ret <8 x i16> %ins1
+}
+
+; extra use is ok
+
+define <8 x i16> @insert_76_v4i16_uses2(i32 %x, <8 x i16> %v) {
+; BE-LABEL: @insert_76_v4i16_uses2(
+; BE-NEXT:    [[LO16:%.*]] = trunc i32 [[X:%.*]] to i16
+; BE-NEXT:    call void @use(i16 [[LO16]])
+; BE-NEXT:    [[TMP1:%.*]] = bitcast <8 x i16> [[V:%.*]] to <4 x i32>
+; BE-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> [[TMP1]], i32 [[X]], i64 3
+; BE-NEXT:    [[INS1:%.*]] = bitcast <4 x i32> [[TMP2]] to <8 x i16>
+; BE-NEXT:    ret <8 x i16> [[INS1]]
+;
+; LE-LABEL: @insert_76_v4i16_uses2(
+; LE-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; LE-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; LE-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; LE-NEXT:    call void @use(i16 [[LO16]])
+; LE-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 7
+; LE-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 6
+; LE-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  call void @use(i16 %lo16)
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 7
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 6
+  ret <8 x i16> %ins1
+}
+
+; TODO: extra use doesn't have to prevent the fold.
+
+define <8 x i16> @insert_67_v4i16_uses3(i32 %x, <8 x i16> %v) {
+; ALL-LABEL: @insert_67_v4i16_uses3(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <8 x i16> [[V:%.*]], i16 [[LO16]], i64 6
+; ALL-NEXT:    call void @use_vec(<8 x i16> [[INS0]])
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <8 x i16> [[INS0]], i16 [[HI16]], i64 7
+; ALL-NEXT:    ret <8 x i16> [[INS1]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins0 = insertelement <8 x i16> %v, i16 %lo16, i64 6
+  call void @use_vec(<8 x i16> %ins0)
+  %ins1 = insertelement <8 x i16> %ins0, i16 %hi16, i64 7
+  ret <8 x i16> %ins1
+}
+
+; TODO: This is equivalent to the 1st test.
+
+define <4 x i16> @insert_01_poison_v4i16_high_first(i32 %x) {
+; ALL-LABEL: @insert_01_poison_v4i16_high_first(
+; ALL-NEXT:    [[HI32:%.*]] = lshr i32 [[X:%.*]], 16
+; ALL-NEXT:    [[HI16:%.*]] = trunc i32 [[HI32]] to i16
+; ALL-NEXT:    [[LO16:%.*]] = trunc i32 [[X]] to i16
+; ALL-NEXT:    [[INS1:%.*]] = insertelement <4 x i16> poison, i16 [[HI16]], i64 1
+; ALL-NEXT:    [[INS0:%.*]] = insertelement <4 x i16> [[INS1]], i16 [[LO16]], i64 0
+; ALL-NEXT:    ret <4 x i16> [[INS0]]
+;
+  %hi32 = lshr i32 %x, 16
+  %hi16 = trunc i32 %hi32 to i16
+  %lo16 = trunc i32 %x to i16
+  %ins1 = insertelement <4 x i16> poison, i16 %hi16, i64 1
+  %ins0 = insertelement <4 x i16> %ins1, i16 %lo16, i64 0
+  ret <4 x i16> %ins0
+}

diff  --git a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
index 1ded4db187841..77cbc70ff3697 100644
--- a/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
+++ b/llvm/test/Transforms/PhaseOrdering/X86/vec-load-combine.ll
@@ -47,34 +47,24 @@ define dso_local noundef <4 x float> @ConvertVectors_ByRef(ptr noundef nonnull a
 define noundef <4 x float> @ConvertVectors_ByVal(ptr noundef nonnull align 16 dereferenceable(16) %V) #0 {
 ; SSE-LABEL: @ConvertVectors_ByVal(
 ; SSE-NEXT:  entry:
-; SSE-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
-; SSE-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; SSE-NEXT:    [[TMP1:%.*]] = lshr i64 [[V_VAL20]], 32
-; SSE-NEXT:    [[TMP2:%.*]] = insertelement <2 x i64> poison, i64 [[V_VAL20]], i64 0
-; SSE-NEXT:    [[TMP3:%.*]] = insertelement <2 x i64> [[TMP2]], i64 [[TMP1]], i64 1
-; SSE-NEXT:    [[TMP4:%.*]] = trunc <2 x i64> [[TMP3]] to <2 x i32>
-; SSE-NEXT:    [[TMP5:%.*]] = shufflevector <2 x i32> [[TMP4]], <2 x i32> poison, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
-; SSE-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
-; SSE-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
-; SSE-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
-; SSE-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; SSE-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
+; SSE-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
+; SSE-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
+; SSE-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
+; SSE-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; SSE-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
+; SSE-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
 ; SSE-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 ; AVX-LABEL: @ConvertVectors_ByVal(
 ; AVX-NEXT:  entry:
-; AVX-NEXT:    [[V_VAL20:%.*]] = load i64, ptr [[V:%.*]], align 16
-; AVX-NEXT:    [[TMP0:%.*]] = getelementptr i8, ptr [[V]], i64 8
-; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP0]], align 8
-; AVX-NEXT:    [[TMP1:%.*]] = trunc i64 [[V_VAL20]] to i32
-; AVX-NEXT:    [[TMP2:%.*]] = insertelement <4 x i32> undef, i32 [[TMP1]], i64 0
-; AVX-NEXT:    [[TMP3:%.*]] = lshr i64 [[V_VAL20]], 32
-; AVX-NEXT:    [[TMP4:%.*]] = trunc i64 [[TMP3]] to i32
-; AVX-NEXT:    [[TMP5:%.*]] = insertelement <4 x i32> [[TMP2]], i32 [[TMP4]], i64 1
-; AVX-NEXT:    [[TMP6:%.*]] = trunc i64 [[V_VAL421]] to i32
-; AVX-NEXT:    [[TMP7:%.*]] = insertelement <4 x i32> [[TMP5]], i32 [[TMP6]], i64 2
-; AVX-NEXT:    [[TMP8:%.*]] = insertelement <4 x i32> [[TMP7]], i32 [[TMP6]], i64 3
-; AVX-NEXT:    [[VECINIT16:%.*]] = bitcast <4 x i32> [[TMP8]] to <4 x float>
+; AVX-NEXT:    [[TMP0:%.*]] = load <4 x float>, ptr [[V:%.*]], align 16
+; AVX-NEXT:    [[TMP1:%.*]] = getelementptr i8, ptr [[V]], i64 8
+; AVX-NEXT:    [[V_VAL421:%.*]] = load i64, ptr [[TMP1]], align 8
+; AVX-NEXT:    [[TMP2:%.*]] = trunc i64 [[V_VAL421]] to i32
+; AVX-NEXT:    [[TMP3:%.*]] = bitcast i32 [[TMP2]] to float
+; AVX-NEXT:    [[VECINIT11:%.*]] = insertelement <4 x float> [[TMP0]], float [[TMP3]], i64 2
+; AVX-NEXT:    [[VECINIT16:%.*]] = insertelement <4 x float> [[VECINIT11]], float [[TMP3]], i64 3
 ; AVX-NEXT:    ret <4 x float> [[VECINIT16]]
 ;
 entry:


        

