[llvm] [VectorCombine] Support simplification to scalar store for multiple insertelt (PR #132820)

via llvm-commits llvm-commits at lists.llvm.org
Tue Mar 25 04:00:22 PDT 2025


https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/132820

From 288dd9f4a23221933b0ca2cf6413593bda63739a Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Mon, 24 Mar 2025 14:47:00 +0900
Subject: [PATCH 1/2] add testcases for upcoming patch

---
 .../VectorCombine/load-insert-store.ll        | 318 ++++++++++++++++++
 1 file changed, 318 insertions(+)

diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 93565c1a708eb..9dcadb5ccf30a 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -16,6 +16,71 @@ entry:
   ret void
 }
 
+define void @insert_store2(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 6
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 7
+; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 6
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 7
+  store <8 x i16> %vec2, ptr %q, align 1
+  ret void
+}
+
+define void @insert_store3(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store3(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 5
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 6
+; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 7
+; CHECK-NEXT:    store <8 x i16> [[VEC3]], ptr [[Q]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 5
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 6
+  %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 7
+  store <8 x i16> %vec3, ptr %q, align 1
+  ret void
+}
+
+define void @insert_store8(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 0
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 1
+; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 2
+; CHECK-NEXT:    [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[S]], i32 3
+; CHECK-NEXT:    [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[S]], i32 4
+; CHECK-NEXT:    [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[S]], i32 5
+; CHECK-NEXT:    [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[S]], i32 6
+; CHECK-NEXT:    [[VEC8:%.*]] = insertelement <8 x i16> [[VEC7]], i16 [[S]], i32 7
+; CHECK-NEXT:    store <8 x i16> [[VEC8]], ptr [[Q]], align 1
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 0
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 1
+  %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 2
+  %vec4 = insertelement <8 x i16> %vec3, i16 %s, i32 3
+  %vec5 = insertelement <8 x i16> %vec4, i16 %s, i32 4
+  %vec6 = insertelement <8 x i16> %vec5, i16 %s, i32 5
+  %vec7 = insertelement <8 x i16> %vec6, i16 %s, i32 6
+  %vec8 = insertelement <8 x i16> %vec7, i16 %s, i32 7
+  store <8 x i16> %vec8, ptr %q, align 1
+  ret void
+}
+
 define void @insert_store_i16_align1(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_i16_align1(
 ; CHECK-NEXT:  entry:
@@ -827,3 +892,256 @@ bb:
 
 declare i32 @bar(i32, i1) readonly
 declare double @llvm.log2.f64(double)
+
+define void @insert_store_gap(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_gap(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 2
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 5
+; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 2
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 5
+  store <8 x i16> %vec2, ptr %q
+  ret void
+}
+
+define void @insert_store_reverse(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_reverse(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 7
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 6
+; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 5
+; CHECK-NEXT:    store <8 x i16> [[VEC3]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 7
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 6
+  %vec3 = insertelement <8 x i16> %vec2, i16 %s, i32 5
+  store <8 x i16> %vec3, ptr %q
+  ret void
+}
+
+define void @insert_store_duplicate(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_duplicate(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 3
+; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 3
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 3
+  store <8 x i16> %vec2, ptr %q
+  ret void
+}
+
+define void @insert_store_i32(ptr %q, i32 zeroext %s) {
+; CHECK-LABEL: @insert_store_i32(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[S:%.*]], i32 2
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[S]], i32 3
+; CHECK-NEXT:    store <4 x i32> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <4 x i32>, ptr %q
+  %vec1 = insertelement <4 x i32> %0, i32 %s, i32 2
+  %vec2 = insertelement <4 x i32> %vec1, i32 %s, i32 3
+  store <4 x i32> %vec2, ptr %q
+  ret void
+}
+
+define void @insert_store_i8(ptr %q, i8 zeroext %s) {
+; CHECK-LABEL: @insert_store_i8(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 8
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <16 x i8> [[VEC1]], i8 [[S]], i32 9
+; CHECK-NEXT:    store <16 x i8> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, ptr %q
+  %vec1 = insertelement <16 x i8> %0, i8 %s, i32 8
+  %vec2 = insertelement <16 x i8> %vec1, i8 %s, i32 9
+  store <16 x i8> %vec2, ptr %q
+  ret void
+}
+
+define void @insert_store_alignment(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_alignment(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 0
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 4
+; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <8 x i16>, ptr %q, align 16
+  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 0
+  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 4
+  store <8 x i16> %vec2, ptr %q, align 16
+  ret void
+}
+
+define void @insert_store_size(ptr %q, i16 zeroext %s) {
+; CHECK-LABEL: @insert_store_size(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[Q:%.*]], align 32
+; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <16 x i16> [[TMP0]], i16 [[S:%.*]], i32 8
+; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <16 x i16> [[VEC1]], i16 [[S]], i32 12
+; CHECK-NEXT:    store <16 x i16> [[VEC2]], ptr [[Q]], align 32
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i16>, ptr %q
+  %vec1 = insertelement <16 x i16> %0, i16 %s, i32 8
+  %vec2 = insertelement <16 x i16> %vec1, i16 %s, i32 12
+  store <16 x i16> %vec2, ptr %q
+  ret void
+}
+
+define void @insert_store_nonconst4(ptr %q, i8 zeroext %s, i32 %idx1, i32 %idx2, i32 %idx3, i32 %idx4) {
+; CHECK-LABEL: @insert_store_nonconst4(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX1:%.*]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <16 x i8> [[VECINS1]], i8 [[S]], i32 [[IDX2:%.*]]
+; CHECK-NEXT:    [[VECINS3:%.*]] = insertelement <16 x i8> [[VECINS2]], i8 [[S]], i32 [[IDX3:%.*]]
+; CHECK-NEXT:    [[VECINS4:%.*]] = insertelement <16 x i8> [[VECINS3]], i8 [[S]], i32 [[IDX4:%.*]]
+; CHECK-NEXT:    store <16 x i8> [[VECINS4]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <16 x i8>, ptr %q
+  %vecins1 = insertelement <16 x i8> %0, i8 %s, i32 %idx1
+  %vecins2 = insertelement <16 x i8> %vecins1, i8 %s, i32 %idx2
+  %vecins3 = insertelement <16 x i8> %vecins2, i8 %s, i32 %idx3
+  %vecins4 = insertelement <16 x i8> %vecins3, i8 %s, i32 %idx4
+  store <16 x i8> %vecins4, ptr %q
+  ret void
+}
+
+define void @insert_store_vscale_nonconst2(ptr %q, i8 zeroext %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_vscale_nonconst2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[TMP0:%.*]] = load <vscale x 16 x i8>, ptr [[Q:%.*]], align 16
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <vscale x 16 x i8> [[TMP0]], i8 [[S:%.*]], i32 [[IDX1:%.*]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <vscale x 16 x i8> [[VECINS1]], i8 [[S]], i32 [[IDX2:%.*]]
+; CHECK-NEXT:    store <vscale x 16 x i8> [[VECINS2]], ptr [[Q]], align 16
+; CHECK-NEXT:    ret void
+;
+entry:
+  %0 = load <vscale x 16 x i8>, ptr %q
+  %vecins1 = insertelement <vscale x 16 x i8> %0, i8 %s, i32 %idx1
+  %vecins2 = insertelement <vscale x 16 x i8> %vecins1, i8 %s, i32 %idx2
+  store <vscale x 16 x i8> %vecins2, ptr %q
+  ret void
+}
+
+define void @insert_store_nonconst_large_alignment2(ptr %q, i32 zeroext %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_large_alignment2(
+; CHECK-NEXT:  entry:
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 4
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 4
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[I:%.*]] = load <4 x i32>, ptr [[Q:%.*]], align 128
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <4 x i32> [[I]], i32 [[S:%.*]], i32 [[IDX1]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <4 x i32> [[VECINS1]], i32 [[S]], i32 [[IDX2]]
+; CHECK-NEXT:    store <4 x i32> [[VECINS2]], ptr [[Q]], align 128
+; CHECK-NEXT:    ret void
+;
+entry:
+  %cmp1 = icmp ult i32 %idx1, 4
+  %cmp2 = icmp ult i32 %idx2, 4
+  call void @llvm.assume(i1 %cmp1)
+  call void @llvm.assume(i1 %cmp2)
+  %i = load <4 x i32>, ptr %q, align 128
+  %vecins1 = insertelement <4 x i32> %i, i32 %s, i32 %idx1
+  %vecins2 = insertelement <4 x i32> %vecins1, i32 %s, i32 %idx2
+  store <4 x i32> %vecins2, ptr %q, align 128
+  ret void
+}
+
+define void @insert_store_nonconst_align_maximum_8_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_maximum_8_2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 8
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
+; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 8
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp ult i32 %idx1, 2
+  %cmp2 = icmp ult i32 %idx2, 2
+  call void @llvm.assume(i1 %cmp1)
+  call void @llvm.assume(i1 %cmp2)
+  %i = load <8 x i64>, ptr %q, align 8
+  %vecins1 = insertelement <8 x i64> %i, i64 %s, i32 %idx1
+  %vecins2 = insertelement <8 x i64> %vecins1, i64 %s, i32 %idx2
+  store <8 x i64> %vecins2, ptr %q, align 8
+  ret void
+}
+
+define void @insert_store_nonconst_align_maximum_4_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_maximum_4_2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
+; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 4
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp ult i32 %idx1, 2
+  %cmp2 = icmp ult i32 %idx2, 2
+  call void @llvm.assume(i1 %cmp1)
+  call void @llvm.assume(i1 %cmp2)
+  %i = load <8 x i64>, ptr %q, align 4
+  %vecins1 = insertelement <8 x i64> %i, i64 %s, i32 %idx1
+  %vecins2 = insertelement <8 x i64> %vecins1, i64 %s, i32 %idx2
+  store <8 x i64> %vecins2, ptr %q, align 4
+  ret void
+}
+
+define void @insert_store_nonconst_align_larger_2(ptr %q, i64 %s, i32 %idx1, i32 %idx2) {
+; CHECK-LABEL: @insert_store_nonconst_align_larger_2(
+; CHECK-NEXT:    [[CMP1:%.*]] = icmp ult i32 [[IDX1:%.*]], 2
+; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
+; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
+; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 4
+; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
+; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
+; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 2
+; CHECK-NEXT:    ret void
+;
+  %cmp1 = icmp ult i32 %idx1, 2
+  %cmp2 = icmp ult i32 %idx2, 2
+  call void @llvm.assume(i1 %cmp1)
+  call void @llvm.assume(i1 %cmp2)
+  %i = load <8 x i64>, ptr %q, align 4
+  %vecins1 = insertelement <8 x i64> %i, i64 %s, i32 %idx1
+  %vecins2 = insertelement <8 x i64> %vecins1, i64 %s, i32 %idx2
+  store <8 x i64> %vecins2, ptr %q, align 2
+  ret void
+}

From e589ed008dd3085221dc9d40627640cc5705c21c Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Mon, 24 Mar 2025 15:59:37 +0900
Subject: [PATCH 2/2] [VectorCombine] Support simplification to scalar store
 for multiple insertelt

Previously, we simplified a load-insertelement-store sequence to a
getelementptr + scalar store only when a single insertelement was present.

This patch extends the fold to chains of multiple insertelements.

Proof: https://alive2.llvm.org/ce/z/QTspTf
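
For illustration, a minimal sketch of the transform (mirroring the
insert_store2 test below; the %gep names are illustrative and the
resulting scalar-store alignments are omitted):

  %0 = load <8 x i16>, ptr %q
  %vec1 = insertelement <8 x i16> %0, i16 %s, i32 6
  %vec2 = insertelement <8 x i16> %vec1, i16 %s, i32 7
  store <8 x i16> %vec2, ptr %q

becomes

  %gep1 = getelementptr inbounds <8 x i16>, ptr %q, i32 0, i32 6
  store i16 %s, ptr %gep1
  %gep2 = getelementptr inbounds <8 x i16>, ptr %q, i32 0, i32 7
  store i16 %s, ptr %gep2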
---
 .../Transforms/Vectorize/VectorCombine.cpp    | 102 ++++++++-----
 .../VectorCombine/load-insert-store.ll        | 136 +++++++++---------
 2 files changed, 138 insertions(+), 100 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 4bfe41a5ed00d..483a344d33fb7 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -115,7 +115,7 @@ class VectorCombine {
   bool scalarizeVPIntrinsic(Instruction &I);
   bool foldExtractedCmps(Instruction &I);
   bool foldBinopOfReductions(Instruction &I);
-  bool foldSingleElementStore(Instruction &I);
+  bool foldInsertElementsStore(Instruction &I);
   bool scalarizeLoadExtract(Instruction &I);
   bool foldConcatOfBoolMasks(Instruction &I);
   bool foldPermuteOfBinops(Instruction &I);
@@ -1493,58 +1493,88 @@ static Align computeAlignmentAfterScalarization(Align VectorAlignment,
 //   %0 = bitcast <4 x i32>* %a to i32*
 //   %1 = getelementptr inbounds i32, i32* %0, i64 0, i64 1
 //   store i32 %b, i32* %1
-bool VectorCombine::foldSingleElementStore(Instruction &I) {
+bool VectorCombine::foldInsertElementsStore(Instruction &I) {
   auto *SI = cast<StoreInst>(&I);
   if (!SI->isSimple() || !isa<VectorType>(SI->getValueOperand()->getType()))
     return false;
 
-  // TODO: Combine more complicated patterns (multiple insert) by referencing
-  // TargetTransformInfo.
-  Instruction *Source;
-  Value *NewElement;
-  Value *Idx;
-  if (!match(SI->getValueOperand(),
-             m_InsertElt(m_Instruction(Source), m_Value(NewElement),
-                         m_Value(Idx))))
-    return false;
-
-  if (auto *Load = dyn_cast<LoadInst>(Source)) {
-    auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
-    Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
-    // Don't optimize for atomic/volatile load or store. Ensure memory is not
-    // modified between, vector type matches store size, and index is inbounds.
-    if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
-        !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
-        SrcAddr != SI->getPointerOperand()->stripPointerCasts())
-      return false;
+  Value *Source = SI->getValueOperand();
+  // Track back through a chain of single-use insertelement instructions.
+  SmallVector<std::pair<Value *, Value *>, 4> InsertElements;
+  Value *Base = Source;
+  while (auto *Insert = dyn_cast<InsertElementInst>(Base)) {
+    if (!Insert->hasOneUse())
+      break;
+    Value *InsertVal = Insert->getOperand(1);
+    Value *Idx = Insert->getOperand(2);
+    InsertElements.push_back({InsertVal, Idx});
+    Base = Insert->getOperand(0);
+  }
 
-    auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
-    if (ScalarizableIdx.isUnsafe() ||
-        isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
-                             MemoryLocation::get(SI), AA))
-      return false;
+  if (InsertElements.empty())
+    return false;
 
-    // Ensure we add the load back to the worklist BEFORE its users so they can
-    // erased in the correct order.
-    Worklist.push(Load);
+  auto *Load = dyn_cast<LoadInst>(Base);
+  if (!Load)
+    return false;
 
+  auto VecTy = cast<VectorType>(SI->getValueOperand()->getType());
+  Value *SrcAddr = Load->getPointerOperand()->stripPointerCasts();
+  // Don't optimize for atomic/volatile load or store. Ensure memory is not
+  // modified between, vector type matches store size, and index is inbounds.
+  if (!Load->isSimple() || Load->getParent() != SI->getParent() ||
+      !DL->typeSizeEqualsStoreSize(Load->getType()->getScalarType()) ||
+      SrcAddr != SI->getPointerOperand()->stripPointerCasts())
+    return false;
+
+  if (isMemModifiedBetween(Load->getIterator(), SI->getIterator(),
+                           MemoryLocation::get(SI), AA))
+    return false;
+
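+  // Bail out unless every insert index admits a safe scalar access; indices
+  // that are only safe behind a freeze get frozen here.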
+  for (size_t i = 0; i < InsertElements.size(); i++) {
+    Value *Idx = InsertElements[i].second;
+    auto ScalarizableIdx = canScalarizeAccess(VecTy, Idx, Load, AC, DT);
+    if (ScalarizableIdx.isUnsafe())
+      return false;
     if (ScalarizableIdx.isSafeWithFreeze())
       ScalarizableIdx.freeze(Builder, *cast<Instruction>(Idx));
+  }
+
+  // Ensure we add the load back to the worklist BEFORE its users so they can
+  // be erased in the correct order.
+  Worklist.push(Load);
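+  // Order the stores so that constant indices come first, in ascending index
+  // order; the stable sort keeps the discovered order among non-constant
+  // indices.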
+  stable_sort(InsertElements, [](const std::pair<Value *, Value *> &A,
+                                 const std::pair<Value *, Value *> &B) {
+    bool AIsConst = isa<ConstantInt>(A.second);
+    bool BIsConst = isa<ConstantInt>(B.second);
+    if (AIsConst != BIsConst)
+      return AIsConst;
+
+    if (AIsConst && BIsConst)
+      return cast<ConstantInt>(A.second)->getZExtValue() <
+             cast<ConstantInt>(B.second)->getZExtValue();
+    return false;
+  });
+
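+  // Emit one scalar store per tracked insertelement; the last scalar store
+  // (NSI) stands in for the original vector store.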
+  StoreInst *NSI;
+  for (size_t i = 0; i < InsertElements.size(); i++) {
+    Value *InsertVal = InsertElements[i].first;
+    Value *Idx = InsertElements[i].second;
+
     Value *GEP = Builder.CreateInBoundsGEP(
         SI->getValueOperand()->getType(), SI->getPointerOperand(),
         {ConstantInt::get(Idx->getType(), 0), Idx});
-    StoreInst *NSI = Builder.CreateStore(NewElement, GEP);
+    NSI = Builder.CreateStore(InsertVal, GEP);
     NSI->copyMetadata(*SI);
     Align ScalarOpAlignment = computeAlignmentAfterScalarization(
-        std::max(SI->getAlign(), Load->getAlign()), NewElement->getType(), Idx,
+        std::max(SI->getAlign(), Load->getAlign()), InsertVal->getType(), Idx,
         *DL);
     NSI->setAlignment(ScalarOpAlignment);
-    replaceValue(I, *NSI);
-    eraseInstruction(I);
-    return true;
   }
 
-  return false;
+  replaceValue(I, *NSI);
+  eraseInstruction(I);
+  return true;
 }
 
 /// Try to scalarize vector loads feeding extractelement instructions.
@@ -3527,7 +3557,7 @@ bool VectorCombine::run() {
     }
 
     if (Opcode == Instruction::Store)
-      MadeChange |= foldSingleElementStore(I);
+      MadeChange |= foldInsertElementsStore(I);
 
     // If this is an early pipeline invocation of this pass, we are done.
     if (TryEarlyFoldsOnly)
diff --git a/llvm/test/Transforms/VectorCombine/load-insert-store.ll b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
index 9dcadb5ccf30a..33b4562844720 100644
--- a/llvm/test/Transforms/VectorCombine/load-insert-store.ll
+++ b/llvm/test/Transforms/VectorCombine/load-insert-store.ll
@@ -19,10 +19,10 @@ entry:
 define void @insert_store2(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store2(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 6
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 7
-; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 6
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -36,11 +36,12 @@ entry:
 define void @insert_store3(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store3(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 5
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 6
-; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 7
-; CHECK-NEXT:    store <8 x i16> [[VEC3]], ptr [[Q]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 5
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -55,16 +56,22 @@ entry:
 define void @insert_store8(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 0
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 1
-; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 2
-; CHECK-NEXT:    [[VEC4:%.*]] = insertelement <8 x i16> [[VEC3]], i16 [[S]], i32 3
-; CHECK-NEXT:    [[VEC5:%.*]] = insertelement <8 x i16> [[VEC4]], i16 [[S]], i32 4
-; CHECK-NEXT:    [[VEC6:%.*]] = insertelement <8 x i16> [[VEC5]], i16 [[S]], i32 5
-; CHECK-NEXT:    [[VEC7:%.*]] = insertelement <8 x i16> [[VEC6]], i16 [[S]], i32 6
-; CHECK-NEXT:    [[VEC8:%.*]] = insertelement <8 x i16> [[VEC7]], i16 [[S]], i32 7
-; CHECK-NEXT:    store <8 x i16> [[VEC8]], ptr [[Q]], align 1
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 1
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 2
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 2
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP2]], align 4
+; CHECK-NEXT:    [[TMP3:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP3]], align 2
+; CHECK-NEXT:    [[TMP4:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 4
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP4]], align 8
+; CHECK-NEXT:    [[TMP5:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 5
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP5]], align 2
+; CHECK-NEXT:    [[TMP6:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP6]], align 4
+; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP7]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -896,10 +903,10 @@ declare double @llvm.log2.f64(double)
 define void @insert_store_gap(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_gap(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 2
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 5
-; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 2
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 5
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -913,11 +920,12 @@ entry:
 define void @insert_store_reverse(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_reverse(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 7
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 6
-; CHECK-NEXT:    [[VEC3:%.*]] = insertelement <8 x i16> [[VEC2]], i16 [[S]], i32 5
-; CHECK-NEXT:    store <8 x i16> [[VEC3]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 5
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 6
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 7
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP2]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -932,10 +940,10 @@ entry:
 define void @insert_store_duplicate(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_duplicate(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 3
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 3
-; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 2
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -949,10 +957,10 @@ entry:
 define void @insert_store_i32(ptr %q, i32 zeroext %s) {
 ; CHECK-LABEL: @insert_store_i32(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <4 x i32>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <4 x i32> [[TMP0]], i32 [[S:%.*]], i32 2
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <4 x i32> [[VEC1]], i32 [[S]], i32 3
-; CHECK-NEXT:    store <4 x i32> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q:%.*]], i32 0, i32 2
+; CHECK-NEXT:    store i32 [[S:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q]], i32 0, i32 3
+; CHECK-NEXT:    store i32 [[S]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -966,10 +974,10 @@ entry:
 define void @insert_store_i8(ptr %q, i8 zeroext %s) {
 ; CHECK-LABEL: @insert_store_i8(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i8>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <16 x i8> [[TMP0]], i8 [[S:%.*]], i32 8
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <16 x i8> [[VEC1]], i8 [[S]], i32 9
-; CHECK-NEXT:    store <16 x i8> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i8>, ptr [[Q:%.*]], i32 0, i32 8
+; CHECK-NEXT:    store i8 [[S:%.*]], ptr [[TMP0]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i8>, ptr [[Q]], i32 0, i32 9
+; CHECK-NEXT:    store i8 [[S]], ptr [[TMP1]], align 1
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -983,10 +991,10 @@ entry:
 define void @insert_store_alignment(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_alignment(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <8 x i16>, ptr [[Q:%.*]], align 16
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <8 x i16> [[TMP0]], i16 [[S:%.*]], i32 0
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <8 x i16> [[VEC1]], i16 [[S]], i32 4
-; CHECK-NEXT:    store <8 x i16> [[VEC2]], ptr [[Q]], align 16
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q:%.*]], i32 0, i32 0
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i16>, ptr [[Q]], i32 0, i32 4
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1000,10 +1008,10 @@ entry:
 define void @insert_store_size(ptr %q, i16 zeroext %s) {
 ; CHECK-LABEL: @insert_store_size(
 ; CHECK-NEXT:  entry:
-; CHECK-NEXT:    [[TMP0:%.*]] = load <16 x i16>, ptr [[Q:%.*]], align 32
-; CHECK-NEXT:    [[VEC1:%.*]] = insertelement <16 x i16> [[TMP0]], i16 [[S:%.*]], i32 8
-; CHECK-NEXT:    [[VEC2:%.*]] = insertelement <16 x i16> [[VEC1]], i16 [[S]], i32 12
-; CHECK-NEXT:    store <16 x i16> [[VEC2]], ptr [[Q]], align 32
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <16 x i16>, ptr [[Q:%.*]], i32 0, i32 8
+; CHECK-NEXT:    store i16 [[S:%.*]], ptr [[TMP0]], align 16
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <16 x i16>, ptr [[Q]], i32 0, i32 12
+; CHECK-NEXT:    store i16 [[S]], ptr [[TMP1]], align 8
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1059,10 +1067,10 @@ define void @insert_store_nonconst_large_alignment2(ptr %q, i32 zeroext %s, i32
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 4
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    [[I:%.*]] = load <4 x i32>, ptr [[Q:%.*]], align 128
-; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <4 x i32> [[I]], i32 [[S:%.*]], i32 [[IDX1]]
-; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <4 x i32> [[VECINS1]], i32 [[S]], i32 [[IDX2]]
-; CHECK-NEXT:    store <4 x i32> [[VECINS2]], ptr [[Q]], align 128
+; CHECK-NEXT:    [[TMP0:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT:    store i32 [[S:%.*]], ptr [[TMP0]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <4 x i32>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT:    store i32 [[S]], ptr [[TMP1]], align 4
 ; CHECK-NEXT:    ret void
 ;
 entry:
@@ -1083,10 +1091,10 @@ define void @insert_store_nonconst_align_maximum_8_2(ptr %q, i64 %s, i32 %idx1,
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 8
-; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
-; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
-; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 8
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT:    store i64 [[S:%.*]], ptr [[TMP1]], align 8
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT:    store i64 [[S]], ptr [[TMP2]], align 8
 ; CHECK-NEXT:    ret void
 ;
   %cmp1 = icmp ult i32 %idx1, 2
@@ -1106,10 +1114,10 @@ define void @insert_store_nonconst_align_maximum_4_2(ptr %q, i64 %s, i32 %idx1,
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
-; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
-; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 4
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT:    store i64 [[S:%.*]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT:    store i64 [[S]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %cmp1 = icmp ult i32 %idx1, 2
@@ -1129,10 +1137,10 @@ define void @insert_store_nonconst_align_larger_2(ptr %q, i64 %s, i32 %idx1, i32
 ; CHECK-NEXT:    [[CMP2:%.*]] = icmp ult i32 [[IDX2:%.*]], 2
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP1]])
 ; CHECK-NEXT:    call void @llvm.assume(i1 [[CMP2]])
-; CHECK-NEXT:    [[I:%.*]] = load <8 x i64>, ptr [[Q:%.*]], align 4
-; CHECK-NEXT:    [[VECINS1:%.*]] = insertelement <8 x i64> [[I]], i64 [[S:%.*]], i32 [[IDX1]]
-; CHECK-NEXT:    [[VECINS2:%.*]] = insertelement <8 x i64> [[VECINS1]], i64 [[S]], i32 [[IDX2]]
-; CHECK-NEXT:    store <8 x i64> [[VECINS2]], ptr [[Q]], align 2
+; CHECK-NEXT:    [[TMP1:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q:%.*]], i32 0, i32 [[IDX2]]
+; CHECK-NEXT:    store i64 [[S:%.*]], ptr [[TMP1]], align 4
+; CHECK-NEXT:    [[TMP2:%.*]] = getelementptr inbounds <8 x i64>, ptr [[Q]], i32 0, i32 [[IDX1]]
+; CHECK-NEXT:    store i64 [[S]], ptr [[TMP2]], align 4
 ; CHECK-NEXT:    ret void
 ;
   %cmp1 = icmp ult i32 %idx1, 2


