[llvm] [VectorCombine] Allow shuffling with bitcast for not multiple offset for loadsize (PR #119139)

via llvm-commits llvm-commits at lists.llvm.org
Sun Dec 8 10:23:38 PST 2024


https://github.com/ParkHanbum created https://github.com/llvm/llvm-project/pull/119139

Previously, vectorization for load-insert failed when the Offset was not
a multiple of the Load type size.

This patch allow it in two steps,
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ


>From 3faaac4ac351adccdc60084bd26dceb074ac9823 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 7 Dec 2024 07:00:41 +0900
Subject: [PATCH 1/2] add test-cases

---
 .../VectorCombine/X86/load-inseltpoison.ll    |  82 ++++++++++-
 .../test/Transforms/VectorCombine/X86/load.ll | 134 +++++++++++++++++-
 2 files changed, 212 insertions(+), 4 deletions(-)

diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index 937d4043adc0c4..dc915b1269efec 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -306,8 +306,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
 ; must be a multiple of element size.
 ; TODO: Could bitcast around this limitation.
 
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
@@ -319,6 +319,84 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl
   ret <4 x i32> %r
 }
 
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> poison, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> poison, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index bdd05a1a37c70f..5a7690f8dbe28f 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -289,8 +289,8 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
 ; must be a multiple of element size.
 ; TODO: Could bitcast around this limitation.
 
-define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_insert_v4i32(
+define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
 ; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
 ; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
 ; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
@@ -302,6 +302,84 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl
   ret <4 x i32> %r
 }
 
+define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
 ; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
 ; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
@@ -331,6 +409,58 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
   ret <4 x i32> %r
 }
 
+define <4 x i32> @gep07_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep07_bitcast_load_i32_from_v8i16_insert_v4i32(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 7
+; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; CHECK-NEXT:    ret <4 x i32> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 7
+  %s = load i32, ptr %gep, align 1
+  %r = insertelement <4 x i32> undef, i32 %s, i64 0
+  ret <4 x i32> %r
+}
+
+define <2 x i64> @gep03_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep03_bitcast_load_i32_from_v4i32_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 3
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 3
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @gep09_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) #0 {
+; CHECK-LABEL: @gep09_bitcast_load_i64_from_v16i8_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 9
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 9
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
+define <2 x i64> @gep05_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
+; CHECK-LABEL: @gep05_bitcast_load_i64_from_v8i16_insert_v2i64(
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    ret <2 x i64> [[R]]
+;
+  %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
+  %s = load i64, ptr %gep, align 1
+  %r = insertelement <2 x i64> undef, i64 %s, i64 0
+  ret <2 x i64> %r
+}
+
 ; If there are enough dereferenceable bytes, we can offset the vector load.
 
 define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {

>From 8eaeb0b7f3bece992cd8b91167939a32eb1ad7dc Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 7 Dec 2024 06:00:08 +0900
Subject: [PATCH 2/2] [VectorCombine] Allow shuffling with bitcast for not
 multiple offset for loadsize

Previously, vectorization for load-insert failed when the Offset was not
a multiple of the Load type size.

This patch allow it in two steps,
1. Vectorize it using a common multiple of Offset and LoadSize.
2. Bitcast to fit

Alive2: https://alive2.llvm.org/ce/z/Kgr9HQ
---
 .../Transforms/Vectorize/VectorCombine.cpp    |  76 +++++++++---
 .../VectorCombine/X86/load-inseltpoison.ll    | 108 ++++++++++++------
 .../test/Transforms/VectorCombine/X86/load.ll | 106 +++++++++++------
 3 files changed, 199 insertions(+), 91 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index b9caf8c0df9be1..85b9feb6c51e03 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -190,6 +190,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   if (!canWidenLoad(Load, TTI))
     return false;
 
+  auto MaxCommonDivisor = [](int n) {
+    if (n % 4 == 0)
+      return 4;
+    if (n % 2 == 0)
+      return 2;
+    else
+      return 1;
+  };
+
   Type *ScalarTy = Scalar->getType();
   uint64_t ScalarSize = ScalarTy->getPrimitiveSizeInBits();
   unsigned MinVectorSize = TTI.getMinVectorRegisterBitWidth();
@@ -204,6 +213,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   unsigned MinVecNumElts = MinVectorSize / ScalarSize;
   auto *MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
   unsigned OffsetEltIndex = 0;
+  unsigned VectorRange = 0;
+  bool NeedCast = false;
   Align Alignment = Load->getAlign();
   if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
                                    &DT)) {
@@ -220,15 +231,27 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
     if (Offset.isNegative())
       return false;
 
-    // The offset must be a multiple of the scalar element to shuffle cleanly
-    // in the element's size.
+    // If Offset is multiple of a Scalar element, it can be shuffled to the
+    // element's size; otherwise, Offset and Scalar must be shuffled to the
+    // appropriate element size for both.
     uint64_t ScalarSizeInBytes = ScalarSize / 8;
-    if (Offset.urem(ScalarSizeInBytes) != 0)
-      return false;
+    if (auto UnalignedBytes = Offset.urem(ScalarSizeInBytes);
+        UnalignedBytes != 0) {
+      uint64_t OldScalarSizeInBytes = ScalarSizeInBytes;
+      // Assign the greatest common divisor between UnalignedBytes and Offset to
+      // ScalarSizeInBytes
+      ScalarSizeInBytes = MaxCommonDivisor(UnalignedBytes);
+      ScalarSize = ScalarSizeInBytes * 8;
+      VectorRange = OldScalarSizeInBytes / ScalarSizeInBytes;
+      MinVecNumElts = MinVectorSize / ScalarSize;
+      ScalarTy = Type::getIntNTy(I.getContext(), ScalarSize);
+      MinVecTy = VectorType::get(ScalarTy, MinVecNumElts, false);
+      NeedCast = true;
+    }
 
-    // If we load MinVecNumElts, will our target element still be loaded?
     OffsetEltIndex = Offset.udiv(ScalarSizeInBytes).getZExtValue();
-    if (OffsetEltIndex >= MinVecNumElts)
+    // If we load MinVecNumElts, will our target element still be loaded?
+    if (OffsetEltIndex + VectorRange >= MinVecNumElts)
       return false;
 
     if (!isSafeToLoadUnconditionally(SrcPtr, MinVecTy, Align(1), *DL, Load, &AC,
@@ -246,12 +269,15 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   Alignment = std::max(SrcPtr->getPointerAlignment(*DL), Alignment);
   Type *LoadTy = Load->getType();
   unsigned AS = Load->getPointerAddressSpace();
+  auto VecTy = cast<InsertElementInst>(&I)->getType();
+
   InstructionCost OldCost =
       TTI.getMemoryOpCost(Instruction::Load, LoadTy, Alignment, AS);
-  APInt DemandedElts = APInt::getOneBitSet(MinVecNumElts, 0);
+  APInt DemandedElts =
+      APInt::getOneBitSet(VecTy->getElementCount().getFixedValue(), 0);
   TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput;
   OldCost +=
-      TTI.getScalarizationOverhead(MinVecTy, DemandedElts,
+      TTI.getScalarizationOverhead(VecTy, DemandedElts,
                                    /* Insert */ true, HasExtract, CostKind);
 
   // New pattern: load VecPtr
@@ -264,14 +290,28 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // We assume this operation has no cost in codegen if there was no offset.
   // Note that we could use freeze to avoid poison problems, but then we might
   // still need a shuffle to change the vector size.
-  auto *Ty = cast<FixedVectorType>(I.getType());
-  unsigned OutputNumElts = Ty->getNumElements();
-  SmallVector<int, 16> Mask(OutputNumElts, PoisonMaskElem);
-  assert(OffsetEltIndex < MinVecNumElts && "Address offset too big");
-  Mask[0] = OffsetEltIndex;
+  SmallVector<int> Mask;
+  assert(OffsetEltIndex + VectorRange < MinVecNumElts &&
+         "Address offset too big");
+  if (!NeedCast) {
+    auto *Ty = cast<FixedVectorType>(I.getType());
+    unsigned OutputNumElts = Ty->getNumElements();
+    Mask.assign(OutputNumElts, PoisonMaskElem);
+    Mask[0] = OffsetEltIndex;
+  } else {
+    Mask.assign(MinVecNumElts, PoisonMaskElem);
+    for (unsigned InsertPos = 0; InsertPos < VectorRange; InsertPos++)
+      Mask[InsertPos] = OffsetEltIndex++;
+  }
+
   if (OffsetEltIndex)
     NewCost += TTI.getShuffleCost(TTI::SK_PermuteSingleSrc, MinVecTy, Mask);
 
+  if (NeedCast)
+    NewCost += TTI.getCastInstrCost(Instruction::BitCast, I.getType(), MinVecTy,
+                                    TargetTransformInfo::CastContextHint::None,
+                                    TargetTransformInfo::TCK_RecipThroughput);
+
   // We can aggressively convert to the vector form because the backend can
   // invert this transform if it does not result in a performance win.
   if (OldCost < NewCost || !NewCost.isValid())
@@ -280,12 +320,16 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
   // It is safe and potentially profitable to load a vector directly:
   // inselt undef, load Scalar, 0 --> load VecPtr
   IRBuilder<> Builder(Load);
+  Value *Result;
   Value *CastedPtr =
       Builder.CreatePointerBitCastOrAddrSpaceCast(SrcPtr, Builder.getPtrTy(AS));
-  Value *VecLd = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
-  VecLd = Builder.CreateShuffleVector(VecLd, Mask);
+  Result = Builder.CreateAlignedLoad(MinVecTy, CastedPtr, Alignment);
+  Result = Builder.CreateShuffleVector(Result, Mask);
 
-  replaceValue(I, *VecLd);
+  if (NeedCast)
+    Result = Builder.CreateBitOrPointerCast(Result, I.getType());
+
+  replaceValue(I, *Result);
   ++NumVecLoad;
   return true;
 }
diff --git a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
index dc915b1269efec..5c3615cffd8e43 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load-inseltpoison.ll
@@ -302,16 +302,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }
 
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
-
 define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -320,11 +322,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -333,11 +341,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
   %s = load i32, ptr %gep, align 1
@@ -346,11 +360,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -359,11 +379,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -372,11 +398,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> poison, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
   %s = load i32, ptr %gep, align 1
@@ -384,11 +416,11 @@ define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
   ret <4 x i32> %r
 }
 
-define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) nofree nosync {
+define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> poison, i64 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index 5a7690f8dbe28f..994ef1f9c66d89 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -285,16 +285,18 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 derefer
   ret <8 x i16> %r
 }
 
-; Negative test - if we are shuffling a load from the base pointer, the address offset
-; must be a multiple of element size.
-; TODO: Could bitcast around this limitation.
-
 define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -303,11 +305,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -316,11 +324,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v16i8_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 11
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <16 x i8>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <16 x i8> [[TMP1]], <16 x i8> poison, <16 x i32> <i32 11, i32 12, i32 13, i32 14, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <16 x i8> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 11
   %s = load i32, ptr %gep, align 1
@@ -329,11 +343,17 @@ define <4 x i32> @gep11_bitcast_load_i32_from_v16i8_insert_v4i32(ptr align 1 der
 }
 
 define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i32, ptr %gep, align 1
@@ -342,11 +362,17 @@ define <4 x i32> @gep01_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
 }
 
 define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
-; CHECK-NEXT:    ret <2 x i64> [[R]]
+; SSE2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; SSE2-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; SSE2-NEXT:    ret <2 x i64> [[R]]
+;
+; AVX2-LABEL: @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 2, i32 3, i32 4, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <2 x i64>
+; AVX2-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
   %s = load i64, ptr %gep, align 1
@@ -355,11 +381,17 @@ define <2 x i64> @gep01_bitcast_load_i64_from_v8i16_insert_v2i64(ptr align 1 der
 }
 
 define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 dereferenceable(16) %p) {
-; CHECK-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
-; CHECK-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
-; CHECK-NEXT:    ret <4 x i32> [[R]]
+; SSE2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; SSE2-NEXT:    [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 5
+; SSE2-NEXT:    [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; SSE2-NEXT:    [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
+; SSE2-NEXT:    ret <4 x i32> [[R]]
+;
+; AVX2-LABEL: @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(
+; AVX2-NEXT:    [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 1
+; AVX2-NEXT:    [[TMP2:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT:    [[R:%.*]] = bitcast <8 x i16> [[TMP2]] to <4 x i32>
+; AVX2-NEXT:    ret <4 x i32> [[R]]
 ;
   %gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 5
   %s = load i32, ptr %gep, align 1
@@ -369,9 +401,9 @@ define <4 x i32> @gep05_bitcast_load_i32_from_v8i16_insert_v4i32(ptr align 1 der
 
 define <2 x i64> @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(ptr align 1 dereferenceable(16) %p) {
 ; CHECK-LABEL: @gep01_bitcast_load_i32_from_v4i32_insert_v2i64(
-; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds <4 x i32>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT:    [[S:%.*]] = load i64, ptr [[GEP]], align 1
-; CHECK-NEXT:    [[R:%.*]] = insertelement <2 x i64> undef, i64 [[S]], i64 0
+; CHECK-NEXT:    [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
+; CHECK-NEXT:    [[TMP2:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT:    [[R:%.*]] = bitcast <4 x i32> [[TMP2]] to <2 x i64>
 ; CHECK-NEXT:    ret <2 x i64> [[R]]
 ;
   %gep = getelementptr inbounds <4 x i32>, ptr %p, i64 0, i64 1



More information about the llvm-commits mailing list