[llvm] [VectorCombine] vectorizeLoadInsert - fix shuffle when inserting into non-poison undef value (PR #119906)
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Fri Dec 13 09:55:09 PST 2024
https://github.com/RKSimon updated https://github.com/llvm/llvm-project/pull/119906
>From 81e78500cb515e9a05b95ab37d0bde9634a501dd Mon Sep 17 00:00:00 2001
From: Simon Pilgrim <llvm-dev at redking.me.uk>
Date: Fri, 13 Dec 2024 16:45:08 +0000
Subject: [PATCH] [VectorCombine] vectorizeLoadInsert - only fold when
inserting into a poison vector
We have corresponding poison tests in the "-inseltpoison.ll" sibling test files.
Fixes #119900
---
.../Transforms/Vectorize/VectorCombine.cpp | 4 +-
.../VectorCombine/AMDGPU/as-transition.ll | 4 +-
.../test/Transforms/VectorCombine/X86/load.ll | 124 +++++++++---------
3 files changed, 63 insertions(+), 69 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 09489e24984530..9bbd6590d099eb 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -179,8 +179,8 @@ bool VectorCombine::vectorizeLoadInsert(Instruction &I) {
// Match insert into fixed vector of scalar value.
// TODO: Handle non-zero insert index.
Value *Scalar;
- if (!match(&I, m_InsertElt(m_Undef(), m_Value(Scalar), m_ZeroInt())) ||
- !Scalar->hasOneUse())
+ if (!match(&I,
+ m_InsertElt(m_Poison(), m_OneUse(m_Value(Scalar)), m_ZeroInt())))
return false;
// Optionally match an extract from another vector.
diff --git a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
index 94b8c98a80df5c..83a0490b42a79a 100644
--- a/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
+++ b/llvm/test/Transforms/VectorCombine/AMDGPU/as-transition.ll
@@ -12,8 +12,8 @@ define protected amdgpu_kernel void @load_from_other_as(ptr nocapture nonnull %r
; CHECK-NEXT: bb:
; CHECK-NEXT: [[A:%.*]] = alloca [[STRUCT_HOGE:%.*]], align 4, addrspace(5)
; CHECK-NEXT: [[TMP0:%.*]] = addrspacecast ptr addrspace(5) [[A]] to ptr
-; CHECK-NEXT: [[TMP1:%.*]] = load <1 x float>, ptr [[TMP0]], align 4
-; CHECK-NEXT: [[E:%.*]] = shufflevector <1 x float> [[TMP1]], <1 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[D:%.*]] = load float, ptr [[TMP0]], align 4
+; CHECK-NEXT: [[E:%.*]] = insertelement <4 x float> undef, float [[D]], i32 0
; CHECK-NEXT: store <4 x float> [[E]], ptr [[RESULTPTR:%.*]], align 16
; CHECK-NEXT: ret void
;
diff --git a/llvm/test/Transforms/VectorCombine/X86/load.ll b/llvm/test/Transforms/VectorCombine/X86/load.ll
index bdd05a1a37c70f..b12104c5c673ea 100644
--- a/llvm/test/Transforms/VectorCombine/X86/load.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/load.ll
@@ -158,8 +158,8 @@ define double @larger_fp_scalar_256bit_vec(ptr align 32 dereferenceable(32) %p)
define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, ptr %p, align 4
@@ -169,8 +169,8 @@ define <4 x float> @load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) n
define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_f32_insert_v4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, ptr %p, align 4
@@ -182,8 +182,8 @@ define <4 x float> @casted_load_f32_insert_v4f32(ptr align 4 dereferenceable(16)
define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%s = load i32, ptr %p, align 4
@@ -195,8 +195,8 @@ define <4 x i32> @load_i32_insert_v4i32(ptr align 16 dereferenceable(16) %p) nof
define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%s = load i32, ptr %p, align 4
@@ -208,8 +208,8 @@ define <4 x i32> @casted_load_i32_insert_v4i32(ptr align 4 dereferenceable(16) %
define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, ptr %p, align 16
@@ -221,8 +221,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32(ptr align 16 dereferenceable(16)
define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @gep00_load_f32_insert_v4f32_addrspace(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr addrspace(44) [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr addrspace(44) [[P:%.*]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i64 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, ptr addrspace(44) %p, align 16
@@ -235,8 +235,8 @@ define <4 x float> @gep00_load_f32_insert_v4f32_addrspace(ptr addrspace(44) alig
define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %p) nofree nosync {
; CHECK-LABEL: @gep01_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 2
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
@@ -248,16 +248,11 @@ define <8 x i16> @gep01_load_i16_insert_v8i16(ptr align 16 dereferenceable(18) %
; Can't safely load the offset vector, but can load+shuffle if it is profitable.
define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable(17) %p) nofree nosync {
-; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2
-; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; SSE2-NEXT: ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 16
-; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT: ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 2
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
%s = load i16, ptr %gep, align 2
@@ -268,16 +263,11 @@ define <8 x i16> @gep01_load_i16_insert_v8i16_deref(ptr align 16 dereferenceable
; Verify that alignment of the new load is not over-specified.
define <8 x i16> @gep01_load_i16_insert_v8i16_deref_minalign(ptr align 2 dereferenceable(16) %p) nofree nosync {
-; SSE2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
-; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 8
-; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; SSE2-NEXT: ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 2
-; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT: ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep01_load_i16_insert_v8i16_deref_minalign(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 0, i64 1
+; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 8
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 0, i64 1
%s = load i16, ptr %gep, align 8
@@ -304,8 +294,9 @@ define <4 x i32> @gep01_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceabl
define <4 x i32> @gep012_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceable(20) %p) nofree nosync {
; CHECK-LABEL: @gep012_bitcast_load_i32_insert_v4i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 1
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <4 x i32> <i32 3, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <16 x i8>, ptr [[P:%.*]], i64 0, i64 12
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[GEP]], align 1
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x i32> undef, i32 [[S]], i64 0
; CHECK-NEXT: ret <4 x i32> [[R]]
;
%gep = getelementptr inbounds <16 x i8>, ptr %p, i64 0, i64 12
@@ -336,8 +327,8 @@ define <4 x i32> @gep013_bitcast_load_i32_insert_v4i32(ptr align 1 dereferenceab
define <8 x i16> @gep10_load_i16_insert_v8i16(ptr align 16 dereferenceable(32) %p) nofree nosync {
; CHECK-LABEL: @gep10_load_i16_insert_v8i16(
; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <8 x i16>, ptr [[P:%.*]], i64 1, i64 0
-; CHECK-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[GEP]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[GEP]], align 16
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <8 x i16>, ptr %p, i64 1, i64 0
@@ -439,8 +430,8 @@ define <4 x float> @load_f32_insert_v4f32_volatile(ptr align 16 dereferenceable(
define <4 x float> @load_f32_insert_v4f32_align(ptr align 1 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v4f32_align(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%s = load float, ptr %p, align 4
@@ -463,8 +454,8 @@ define <4 x float> @load_f32_insert_v4f32_deref(ptr align 4 dereferenceable(15)
define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_i32_insert_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%s = load i32, ptr %p, align 4
@@ -474,8 +465,8 @@ define <8 x i32> @load_i32_insert_v8i32(ptr align 16 dereferenceable(16) %p) nof
define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @casted_load_i32_insert_v8i32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x i32>, ptr [[P:%.*]], align 4
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> poison, <8 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load i32, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i32> undef, i32 [[S]], i32 0
; CHECK-NEXT: ret <8 x i32> [[R]]
;
%s = load i32, ptr %p, align 4
@@ -485,8 +476,8 @@ define <8 x i32> @casted_load_i32_insert_v8i32(ptr align 4 dereferenceable(16) %
define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v16f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <16 x i32> <i32 0, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <16 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <16 x float> [[R]]
;
%s = load float, ptr %p, align 4
@@ -496,8 +487,8 @@ define <16 x float> @load_f32_insert_v16f32(ptr align 16 dereferenceable(16) %p)
define <2 x float> @load_f32_insert_v2f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_f32_insert_v2f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: [[S:%.*]] = load float, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[R:%.*]] = insertelement <2 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <2 x float> [[R]]
;
%s = load float, ptr %p, align 4
@@ -549,8 +540,9 @@ define void @PR47558_multiple_use_load(ptr nocapture nonnull %resultptr, ptr noc
define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
; CHECK-LABEL: @load_v2f32_extract_insert_v4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[L:%.*]] = load <2 x float>, ptr [[P:%.*]], align 4
+; CHECK-NEXT: [[S:%.*]] = extractelement <2 x float> [[L]], i32 0
+; CHECK-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
; CHECK-NEXT: ret <4 x float> [[R]]
;
%l = load <2 x float>, ptr %p, align 4
@@ -560,10 +552,17 @@ define <4 x float> @load_v2f32_extract_insert_v4f32(ptr align 16 dereferenceable
}
define <4 x float> @load_v8f32_extract_insert_v4f32(ptr align 16 dereferenceable(16) %p) nofree nosync {
-; CHECK-LABEL: @load_v8f32_extract_insert_v4f32(
-; CHECK-NEXT: [[TMP1:%.*]] = load <4 x float>, ptr [[P:%.*]], align 16
-; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> poison, <4 x i32> <i32 0, i32 poison, i32 poison, i32 poison>
-; CHECK-NEXT: ret <4 x float> [[R]]
+; SSE2-LABEL: @load_v8f32_extract_insert_v4f32(
+; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <8 x float>, ptr [[P:%.*]], i32 0, i32 0
+; SSE2-NEXT: [[S:%.*]] = load float, ptr [[TMP1]], align 4
+; SSE2-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; SSE2-NEXT: ret <4 x float> [[R]]
+;
+; AVX2-LABEL: @load_v8f32_extract_insert_v4f32(
+; AVX2-NEXT: [[L:%.*]] = load <8 x float>, ptr [[P:%.*]], align 4
+; AVX2-NEXT: [[S:%.*]] = extractelement <8 x float> [[L]], i32 0
+; AVX2-NEXT: [[R:%.*]] = insertelement <4 x float> undef, float [[S]], i32 0
+; AVX2-NEXT: ret <4 x float> [[R]]
;
%l = load <8 x float>, ptr %p, align 4
%s = extractelement <8 x float> %l, i32 0
@@ -589,17 +588,12 @@ define <8 x i32> @load_v1i32_extract_insert_v8i32_extra_use(ptr align 16 derefer
; Can't safely load the offset vector, but can load+shuffle if it is profitable.
define <8 x i16> @gep1_load_v2i16_extract_insert_v8i16(ptr align 1 dereferenceable(16) %p) nofree nosync {
-; SSE2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; SSE2-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1
-; SSE2-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0
-; SSE2-NEXT: [[S:%.*]] = load i16, ptr [[TMP1]], align 8
-; SSE2-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
-; SSE2-NEXT: ret <8 x i16> [[R]]
-;
-; AVX2-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
-; AVX2-NEXT: [[TMP1:%.*]] = load <8 x i16>, ptr [[P:%.*]], align 4
-; AVX2-NEXT: [[R:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
-; AVX2-NEXT: ret <8 x i16> [[R]]
+; CHECK-LABEL: @gep1_load_v2i16_extract_insert_v8i16(
+; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds <2 x i16>, ptr [[P:%.*]], i64 1
+; CHECK-NEXT: [[TMP1:%.*]] = getelementptr inbounds <2 x i16>, ptr [[GEP]], i32 0, i32 0
+; CHECK-NEXT: [[S:%.*]] = load i16, ptr [[TMP1]], align 8
+; CHECK-NEXT: [[R:%.*]] = insertelement <8 x i16> undef, i16 [[S]], i64 0
+; CHECK-NEXT: ret <8 x i16> [[R]]
;
%gep = getelementptr inbounds <2 x i16>, ptr %p, i64 1
%l = load <2 x i16>, ptr %gep, align 8
More information about the llvm-commits
mailing list