[llvm] 434819c - [PhaseOrdering][X86] Add test coverage for #34072
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Thu Dec 19 09:32:39 PST 2024
Author: Simon Pilgrim
Date: 2024-12-19T17:32:18Z
New Revision: 434819c35f4e0168248a30677077fe7c8c8ab29c
URL: https://github.com/llvm/llvm-project/commit/434819c35f4e0168248a30677077fe7c8c8ab29c
DIFF: https://github.com/llvm/llvm-project/commit/434819c35f4e0168248a30677077fe7c8c8ab29c.diff
LOG: [PhaseOrdering][X86] Add test coverage for #34072
Add tests for horizontal add patterns with missing/undemanded elements, which typically prevent folding to the optimal (add (shuffle a, b),(shuffle a, b)) pattern
Added:
llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
new file mode 100644
index 00000000000000..664f144aa15ae3
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hadd.ll
@@ -0,0 +1,1241 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR34072 - failure to canonicalize to (add (shuffle a, b),(shuffle a, b)) for optimal horizontal add patterns (with undemanded elements)
+
+;
+; v8i16
+;
+
+define <8 x i16> @add_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) { ; Baseline v8i16 case: pairwise sums of adjacent elements of %a and %b, with every result lane demanded.
+; CHECK-LABEL: @add_v8i16_01234567(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %a01 = add i16 %a0, %a1 ; each %aNM / %bNM is the scalar sum of adjacent elements N and M
+ %a23 = add i16 %a2, %a3
+ %a45 = add i16 %a4, %a5
+ %a67 = add i16 %a6, %a7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %b01 = add i16 %b0, %b1
+ %b23 = add i16 %b2, %b3
+ %b45 = add i16 %b4, %b5
+ %b67 = add i16 %b6, %b7
+ %hadd0 = insertelement <8 x i16> poison, i16 %a01, i32 0 ; lanes 0-3 receive the %a pair sums, lanes 4-7 the %b pair sums
+ %hadd1 = insertelement <8 x i16> %hadd0, i16 %a23, i32 1
+ %hadd2 = insertelement <8 x i16> %hadd1, i16 %a45, i32 2
+ %hadd3 = insertelement <8 x i16> %hadd2, i16 %a67, i32 3
+ %hadd4 = insertelement <8 x i16> %hadd3, i16 %b01, i32 4
+ %hadd5 = insertelement <8 x i16> %hadd4, i16 %b23, i32 5
+ %hadd6 = insertelement <8 x i16> %hadd5, i16 %b45, i32 6
+ %hadd7 = insertelement <8 x i16> %hadd6, i16 %b67, i32 7
+ %result = shufflevector <8 x i16> %hadd7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask: indices 0-7 all select from %hadd7, so the %a operand is never read
+ ret <8 x i16> %result
+}
+
+define <8 x i16> @add_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) { ; v8i16 horizontal add with result lane 0 undemanded; the expected output below is recorded separately per target prefix.
+; SSE2-LABEL: @add_v8i16_u1234567(
+; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
+; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
+; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
+; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
+; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
+; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
+; SSE2-NEXT: [[A23:%.*]] = add i16 [[A2]], [[A3]]
+; SSE2-NEXT: [[A45:%.*]] = add i16 [[A4]], [[A5]]
+; SSE2-NEXT: [[A67:%.*]] = add i16 [[A6]], [[A7]]
+; SSE2-NEXT: [[HADD1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
+; SSE2-NEXT: [[HADD2:%.*]] = insertelement <8 x i16> [[HADD1]], i16 [[A45]], i64 2
+; SSE2-NEXT: [[HADD3:%.*]] = insertelement <8 x i16> [[HADD2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = add <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT: ret <8 x i16> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8i16_u1234567(
+; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
+; SSE4-NEXT: [[HADD1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE4-NEXT: ret <8 x i16> [[RESULT]]
+;
+; AVX2-LABEL: @add_v8i16_u1234567(
+; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
+; AVX2-NEXT: [[HADD1:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> poison, <8 x i32> <i32 poison, i32 2, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[HADD1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 1, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
+; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX2-NEXT: ret <8 x i16> [[RESULT]]
+;
+; AVX512-LABEL: @add_v8i16_u1234567(
+; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> poison, <8 x i32> <i32 poison, i32 poison, i32 3, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP1:%.*]] = add <8 x i16> [[A]], [[SHIFT]]
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP4:%.*]] = add <8 x i16> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: [[HADD32:%.*]] = shufflevector <8 x i16> [[TMP1]], <8 x i16> [[TMP4]], <8 x i32> <i32 poison, i32 2, i32 8, i32 9, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP7:%.*]] = add <8 x i16> [[TMP5]], [[TMP6]]
+; AVX512-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HADD32]], <8 x i16> [[TMP7]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; AVX512-NEXT: ret <8 x i16> [[RESULT]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %a01 = add i16 %a0, %a1 ; still computed in the input, but only feeds the undemanded lane 0
+ %a23 = add i16 %a2, %a3
+ %a45 = add i16 %a4, %a5
+ %a67 = add i16 %a6, %a7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %b01 = add i16 %b0, %b1
+ %b23 = add i16 %b2, %b3
+ %b45 = add i16 %b4, %b5
+ %b67 = add i16 %b6, %b7
+ %hadd0 = insertelement <8 x i16> poison, i16 %a01, i32 0 ; lanes 0-3 receive the %a pair sums, lanes 4-7 the %b pair sums
+ %hadd1 = insertelement <8 x i16> %hadd0, i16 %a23, i32 1
+ %hadd2 = insertelement <8 x i16> %hadd1, i16 %a45, i32 2
+ %hadd3 = insertelement <8 x i16> %hadd2, i16 %a67, i32 3
+ %hadd4 = insertelement <8 x i16> %hadd3, i16 %b01, i32 4
+ %hadd5 = insertelement <8 x i16> %hadd4, i16 %b23, i32 5
+ %hadd6 = insertelement <8 x i16> %hadd5, i16 %b45, i32 6
+ %hadd7 = insertelement <8 x i16> %hadd6, i16 %b67, i32 7
+ %result = shufflevector <8 x i16> %hadd7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; poison at mask index 0 makes lane 0 undemanded; lanes 1-7 pass through from %hadd7
+ ret <8 x i16> %result
+}
+
+;
+; v4i32
+;
+
+define <4 x i32> @add_v4i32_0123(<4 x i32> %a, <4 x i32> %b) { ; Baseline v4i32 case: pairwise sums of adjacent elements of %a and %b, with every result lane demanded.
+; CHECK-LABEL: @add_v4i32_0123(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1 ; each %aNM / %bNM is the scalar sum of adjacent elements N and M
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask: indices 0-3 all select from %hadd3, so the %a operand is never read
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_u123(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lane 0 (%a01) undemanded.
+; CHECK-LABEL: @add_v4i32_u123(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1 ; still computed in the input, but only feeds the undemanded lane 0
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3> ; poison at mask index 0 makes lane 0 undemanded; lanes 1-3 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lane 1 (%a23) undemanded.
+; CHECK-LABEL: @add_v4i32_0u23(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1
+ %a23 = add i32 %a2, %a3 ; still computed in the input, but only feeds the undemanded lane 1
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3> ; poison at mask index 1 makes lane 1 undemanded; lanes 0, 2 and 3 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lane 2 (%b01) undemanded; the expected output below is recorded separately per target prefix.
+; SSE2-LABEL: @add_v4i32_01u3(
+; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE2-NEXT: ret <4 x i32> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4i32_01u3(
+; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE4-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; SSE4-NEXT: ret <4 x i32> [[RESULT]]
+;
+; AVX2-LABEL: @add_v4i32_01u3(
+; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX2-NEXT: [[TMP1:%.*]] = add <4 x i32> [[SHIFT]], [[B]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x i32> [[TMP4]], <4 x i32> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; AVX2-NEXT: ret <4 x i32> [[RESULT]]
+;
+; AVX512-LABEL: @add_v4i32_01u3(
+; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; AVX512-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX512-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1 ; still computed in the input, but only feeds the undemanded lane 2
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3> ; poison at mask index 2 makes lane 2 undemanded; lanes 0, 1 and 3 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_012u(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lane 3 (%b23) undemanded.
+; CHECK-LABEL: @add_v4i32_012u(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = add <4 x i32> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = add <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x i32> [[TMP1]], <4 x i32> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; CHECK-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3 ; still computed in the input, but only feeds the undemanded lane 3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> ; poison at mask index 3 makes lane 3 undemanded; lanes 0-2 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lanes 0-1 undemanded, so only the %b pair sums are observed.
+; CHECK-LABEL: @add_v4i32_uu23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
+; CHECK-NEXT: [[RESULT1:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1 ; both %a sums are still computed in the input, but only feed the undemanded lanes 0-1
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3> ; poison at mask indices 0-1 makes those lanes undemanded; lanes 2-3 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @add_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) { ; v4i32 horizontal add with result lanes 2-3 undemanded, so only the %a pair sums are observed.
+; CHECK-LABEL: @add_v4i32_01uu(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = add <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = add i32 %a0, %a1
+ %a23 = add i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = add i32 %b0, %b1 ; both %b sums are still computed in the input, but only feed the undemanded lanes 2-3
+ %b23 = add i32 %b2, %b3
+ %hadd0 = insertelement <4 x i32> poison, i32 %a01, i32 0 ; lanes 0-1 receive the %a pair sums, lanes 2-3 the %b pair sums
+ %hadd1 = insertelement <4 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <4 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <4 x i32> %hadd2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hadd3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; poison at mask indices 2-3 makes those lanes undemanded; lanes 0-1 pass through from %hadd3
+ ret <4 x i32> %result
+}
+
+;
+; v8i32
+;
+
+define <8 x i32> @add_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) { ; Baseline v8i32 case with every result lane demanded; expected output differs between the SSE-level and AVX-level prefixes.
+; SSE-LABEL: @add_v8i32_01234567(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP5:%.*]] = add <4 x i32> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = add <4 x i32> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x i32> [[TMP5]], <4 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x i32> [[TMP7]]
+;
+; AVX-LABEL: @add_v8i32_01234567(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x i32> [[TMP3]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %a01 = add i32 %a0, %a1 ; each %aNM / %bNM is the scalar sum of adjacent elements N and M
+ %a23 = add i32 %a2, %a3
+ %a45 = add i32 %a4, %a5
+ %a67 = add i32 %a6, %a7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %b45 = add i32 %b4, %b5
+ %b67 = add i32 %b6, %b7
+ %hadd0 = insertelement <8 x i32> poison, i32 %a01, i32 0 ; lane order is a01,a23,b01,b23,a45,a67,b45,b67: %a and %b sums alternate within each 4-lane half (presumably mirroring 256-bit x86 hadd layout - TODO confirm)
+ %hadd1 = insertelement <8 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <8 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <8 x i32> %hadd2, i32 %b23, i32 3
+ %hadd4 = insertelement <8 x i32> %hadd3, i32 %a45, i32 4
+ %hadd5 = insertelement <8 x i32> %hadd4, i32 %a67, i32 5
+ %hadd6 = insertelement <8 x i32> %hadd5, i32 %b45, i32 6
+ %hadd7 = insertelement <8 x i32> %hadd6, i32 %b67, i32 7
+ %result = shufflevector <8 x i32> %hadd7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask: indices 0-7 all select from %hadd7, so the %a operand is never read
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @add_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) { ; v8i32 horizontal add with result lane 5 (%a67) undemanded; the expected output below is recorded separately per target prefix.
+; SSE2-LABEL: @add_v8i32_01234u67(
+; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE2-NEXT: ret <8 x i32> [[RESULT]]
+;
+; SSE4-LABEL: @add_v8i32_01234u67(
+; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT: [[A45:%.*]] = add i32 [[A4]], [[A5]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = add <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[HADD4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = add <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT: ret <8 x i32> [[RESULT]]
+;
+; AVX-LABEL: @add_v8i32_01234u67(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = add <8 x i32> [[A]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = add <8 x i32> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[HADD4:%.*]] = shufflevector <8 x i32> [[TMP4]], <8 x i32> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 12, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP7:%.*]] = add <8 x i32> [[TMP5]], [[TMP6]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HADD4]], <8 x i32> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; AVX-NEXT: ret <8 x i32> [[RESULT]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %a01 = add i32 %a0, %a1
+ %a23 = add i32 %a2, %a3
+ %a45 = add i32 %a4, %a5
+ %a67 = add i32 %a6, %a7 ; still computed in the input, but only feeds the undemanded lane 5
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %b01 = add i32 %b0, %b1
+ %b23 = add i32 %b2, %b3
+ %b45 = add i32 %b4, %b5
+ %b67 = add i32 %b6, %b7
+ %hadd0 = insertelement <8 x i32> poison, i32 %a01, i32 0 ; lane order is a01,a23,b01,b23,a45,a67,b45,b67: %a and %b sums alternate within each 4-lane half
+ %hadd1 = insertelement <8 x i32> %hadd0, i32 %a23, i32 1
+ %hadd2 = insertelement <8 x i32> %hadd1, i32 %b01, i32 2
+ %hadd3 = insertelement <8 x i32> %hadd2, i32 %b23, i32 3
+ %hadd4 = insertelement <8 x i32> %hadd3, i32 %a45, i32 4
+ %hadd5 = insertelement <8 x i32> %hadd4, i32 %a67, i32 5
+ %hadd6 = insertelement <8 x i32> %hadd5, i32 %b45, i32 6
+ %hadd7 = insertelement <8 x i32> %hadd6, i32 %b67, i32 7
+ %result = shufflevector <8 x i32> %hadd7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7> ; poison at mask index 5 makes lane 5 undemanded; all other lanes pass through from %hadd7
+ ret <8 x i32> %result
+}
+
+;
+; v4f32
+;
+
+define <4 x float> @add_v4f32_0123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @add_v4f32_0123(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+; Baseline v4f32 case - the scalar input below builds <a0+a1, a2+a3, b0+b1, b2+b3> and keeps every lane; the expectations above are a single fadd of an even-element and an odd-element shuffle of %a/%b.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask - all four lanes demanded; indices stay below 4 so %a is never selected
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_u123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @add_v4f32_u123(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 poison, i32 2, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+; Same scalar hadd as the 0123 form but lane 0 (a0+a1) is undemanded; the expectations above record two separate fadds (one on %a, one on %b) blended by a final shuffle instead of one fadd of two shuffles.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3> ; lane 0 is poison, i.e. a0+a1 is not demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @add_v4f32_0u23(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 poison, i32 4, i32 5>
+; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+; Same scalar hadd as the 0123 form but lane 1 (a2+a3) is undemanded; the expectations above record two separate fadds (one on %a, one on %b) blended by a final shuffle.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3> ; lane 1 is poison, i.e. a2+a3 is not demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @add_v4f32_01u3(
+; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; SSE2-NEXT: ret <4 x float> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4f32_01u3(
+; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; SSE4-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; SSE4-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX2-LABEL: @add_v4f32_01u3(
+; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 poison, i32 2>
+; AVX2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[SHIFT]], [[B]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 7>
+; AVX2-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX512-LABEL: @add_v4f32_01u3(
+; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 poison, i32 6, i32 poison>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 poison, i32 7, i32 poison>
+; AVX512-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX512-NEXT: ret <4 x float> [[RESULT1]]
+; Same scalar hadd as the 0123 form but lane 2 (b0+b1) is undemanded; the expectations above come in two shapes (the SSE2 and AVX512 blocks match each other, as do the SSE4 and AVX2 blocks), each with two fadds and a final blend.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3> ; lane 2 is poison, i.e. b0+b1 is not demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_012u(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @add_v4f32_012u(
+; SSE2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; SSE2-NEXT: ret <4 x float> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4f32_012u(
+; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; SSE4-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX2-LABEL: @add_v4f32_012u(
+; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[B]], [[SHIFT]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; AVX2-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX512-LABEL: @add_v4f32_012u(
+; AVX512-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP1:%.*]] = fadd <4 x float> [[A]], [[SHIFT]]
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B:%.*]], <4 x i32> <i32 2, i32 4, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 3, i32 5, i32 poison, i32 poison>
+; AVX512-NEXT: [[TMP4:%.*]] = fadd <4 x float> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: [[RESULT1:%.*]] = shufflevector <4 x float> [[TMP1]], <4 x float> [[TMP4]], <4 x i32> <i32 0, i32 4, i32 5, i32 poison>
+; AVX512-NEXT: ret <4 x float> [[RESULT1]]
+; Same scalar hadd as the 0123 form but lane 3 (b2+b3) is undemanded; the expectations above come in two shapes (the SSE2 and AVX512 blocks match each other, as do the SSE4 and AVX2 blocks), each with two fadds and a final blend.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> ; lane 3 is poison, i.e. b2+b3 is not demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_uu23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @add_v4f32_uu23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 3>
+; CHECK-NEXT: [[RESULT1:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+; Lanes 0 and 1 (both sums taken from %a) are undemanded, so only the %b sums survive; the expectations above use %b alone - two single-source shuffles feeding one fadd.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3> ; lanes 0 and 1 are poison, i.e. neither %a sum is demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @add_v4f32_01uu(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @add_v4f32_01uu(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 0, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+; Lanes 2 and 3 (both sums taken from %b) are undemanded, so only the %a sums survive; the expectations above use %a alone - two single-source shuffles feeding one fadd.
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %hadd0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <4 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <4 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <4 x float> %hadd2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hadd3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; lanes 2 and 3 are poison, i.e. neither %b sum is demanded
+ ret <4 x float> %result
+}
+
+;
+; v8f32
+;
+
+define <8 x float> @add_v8f32_01234567(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @add_v8f32_01234567(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 8, i32 10>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 4, i32 6, i32 12, i32 14>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 9, i32 11>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <4 x i32> <i32 5, i32 7, i32 13, i32 15>
+; SSE-NEXT: [[TMP5:%.*]] = fadd <4 x float> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = fadd <4 x float> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <4 x float> [[TMP5]], <4 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: ret <8 x float> [[TMP7]]
+;
+; AVX-LABEL: @add_v8f32_01234567(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <8 x float> [[TMP3]]
+; Baseline v8f32 case - the scalar input below builds <a0+a1, a2+a3, b0+b1, b2+b3, a4+a5, a6+a7, b4+b5, b6+b7> and keeps every lane; the expectations above are two <4 x float> fadds concatenated for the SSE run lines and one <8 x float> fadd for the AVX run lines.
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %a45 = fadd float %a4, %a5
+ %a67 = fadd float %a6, %a7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %b45 = fadd float %b4, %b5
+ %b67 = fadd float %b6, %b7
+ %hadd0 = insertelement <8 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <8 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <8 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <8 x float> %hadd2, float %b23, i32 3
+ %hadd4 = insertelement <8 x float> %hadd3, float %a45, i32 4
+ %hadd5 = insertelement <8 x float> %hadd4, float %a67, i32 5
+ %hadd6 = insertelement <8 x float> %hadd5, float %b45, i32 6
+ %hadd7 = insertelement <8 x float> %hadd6, float %b67, i32 7
+ %result = shufflevector <8 x float> %hadd7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask - all eight lanes demanded; indices stay below 8 so %a is never selected
+ ret <8 x float> %result
+}
+
+define <8 x float> @add_v8f32_012u4567(<8 x float> %a, <8 x float> %b) {
+; SSE-LABEL: @add_v8f32_012u4567(
+; SSE-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE-NEXT: [[A67:%.*]] = fadd float [[A6]], [[A7]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[HADD5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP6:%.*]] = fadd <8 x float> [[TMP4]], [[TMP5]]
+; SSE-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE-NEXT: ret <8 x float> [[RESULT]]
+;
+; AVX-LABEL: @add_v8f32_012u4567(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> poison, <8 x i32> <i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 7, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = fadd <8 x float> [[A]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fadd <8 x float> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[HADD5:%.*]] = shufflevector <8 x float> [[TMP4]], <8 x float> [[TMP1]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 14, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP7:%.*]] = fadd <8 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HADD5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; AVX-NEXT: ret <8 x float> [[RESULT]]
+; Same scalar hadd as the 01234567 form but lane 3 (b2+b3) is undemanded; the expectations above keep a scalar extract/fadd/insert for a6+a7 on the SSE run lines and use three vector fadds joined by shuffles on the AVX run lines.
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %a01 = fadd float %a0, %a1
+ %a23 = fadd float %a2, %a3
+ %a45 = fadd float %a4, %a5
+ %a67 = fadd float %a6, %a7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %b01 = fadd float %b0, %b1
+ %b23 = fadd float %b2, %b3
+ %b45 = fadd float %b4, %b5
+ %b67 = fadd float %b6, %b7
+ %hadd0 = insertelement <8 x float> poison, float %a01, i32 0
+ %hadd1 = insertelement <8 x float> %hadd0, float %a23, i32 1
+ %hadd2 = insertelement <8 x float> %hadd1, float %b01, i32 2
+ %hadd3 = insertelement <8 x float> %hadd2, float %b23, i32 3
+ %hadd4 = insertelement <8 x float> %hadd3, float %a45, i32 4
+ %hadd5 = insertelement <8 x float> %hadd4, float %a67, i32 5
+ %hadd6 = insertelement <8 x float> %hadd5, float %b45, i32 6
+ %hadd7 = insertelement <8 x float> %hadd6, float %b67, i32 7
+ %result = shufflevector <8 x float> %hadd7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7> ; lane 3 is poison, i.e. b2+b3 is not demanded
+ ret <8 x float> %result
+}
+
+;
+; v2f64
+;
+
+define <2 x double> @add_v2f64_01(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @add_v2f64_01(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+; Baseline v2f64 case - the scalar input below builds <a0+a1, b0+b1> and keeps both lanes; the expectations above are a single fadd of two two-source shuffles.
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fadd double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fadd double %b0, %b1
+ %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 0, i32 1> ; identity mask - both lanes demanded; indices stay below 2 so %a is never selected
+ ret <2 x double> %result
+}
+
+define <2 x double> @add_v2f64_u1(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @add_v2f64_u1(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[B]], [[SHIFT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; Lane 0 (a0+a1) is undemanded, so only b0+b1 survives; the expectations above use %b alone - shift element 1 down, fadd, then move element 0 of the sum into lane 1.
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fadd double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fadd double %b0, %b1
+ %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 poison, i32 1> ; lane 0 is poison, i.e. a0+a1 is not demanded
+ ret <2 x double> %result
+}
+
+define <2 x double> @add_v2f64_0u(<2 x double> %a, <2 x double> %b) {
+; CHECK-LABEL: @add_v2f64_0u(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fadd <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: ret <2 x double> [[RESULT]]
+; Lane 1 (b0+b1) is undemanded, so only a0+a1 survives; the expectations above use %a alone - shift element 1 down, fadd, then keep element 0 of the sum in lane 0.
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fadd double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fadd double %b0, %b1
+ %hadd0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <2 x double> %hadd0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hadd1, <2 x double> %a, <2 x i32> <i32 0, i32 poison> ; lane 1 is poison, i.e. b0+b1 is not demanded
+ ret <2 x double> %result
+}
+
+;
+; v4f64
+;
+
+define <4 x double> @add_v4f64_0123(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @add_v4f64_0123(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 2, i32 6>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = fadd <2 x double> [[TMP1]], [[TMP3]]
+; SSE-NEXT: [[TMP6:%.*]] = fadd <2 x double> [[TMP2]], [[TMP4]]
+; SSE-NEXT: [[TMP7:%.*]] = shufflevector <2 x double> [[TMP5]], <2 x double> [[TMP6]], <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+; SSE-NEXT: ret <4 x double> [[TMP7]]
+;
+; AVX-LABEL: @add_v4f64_0123(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; AVX-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[TMP3]]
+; Baseline v4f64 case - the scalar input below builds <a0+a1, b0+b1, a2+a3, b2+b3> (note the a/b interleaving) and keeps every lane; the expectations above are two <2 x double> fadds concatenated for the SSE run lines and one <4 x double> fadd for the AVX run lines.
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask - all four lanes demanded; indices stay below 4 so %a is never selected
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_u123(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @add_v4f64_u123(
+; SSE2-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; SSE2-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT: ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @add_v4f64_u123(
+; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @add_v4f64_u123(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 poison, i32 1, i32 2, i32 6>
+; AVX-NEXT: ret <4 x double> [[RESULT]]
+; Same scalar hadd as the v4f64 0123 form but lane 0 (a0+a1) is undemanded; the expectations above keep a scalar b2+b3 fadd plus insertelement on both SSE variants and use two vector fadds blended by a shuffle on the AVX run lines.
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3> ; lane 0 is poison, i.e. a0+a1 is not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_0u23(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @add_v4f64_0u23(
+; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @add_v4f64_0u23(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 1, i32 poison, i32 2, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <4 x i32> <i32 0, i32 poison, i32 3, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT: ret <4 x double> [[RESULT]]
+; Input IR below: scalar pairwise fadds are packed as <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps lanes 0, 2 and 3 and leaves lane 1 poison (every mask index is below 4, so its %a operand is never selected).
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_01u3(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @add_v4f64_01u3(
+; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT: [[B23:%.*]] = fadd double [[B2]], [[B3]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @add_v4f64_01u3(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[B]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 poison, i32 6>
+; AVX-NEXT: ret <4 x double> [[RESULT]]
+; Input IR below: scalar pairwise fadds are packed as <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps lanes 0, 1 and 3 and leaves lane 2 poison (every mask index is below 4, so its %a operand is never selected).
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_012u(<4 x double> %a, <4 x double> %b) {
+; SSE-LABEL: @add_v4f64_012u(
+; SSE-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE-NEXT: [[A23:%.*]] = fadd double [[A2]], [[A3]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
+; SSE-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @add_v4f64_012u(
+; AVX-NEXT: [[SHIFT:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 3, i32 poison>
+; AVX-NEXT: [[TMP1:%.*]] = fadd <4 x double> [[A]], [[SHIFT]]
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fadd <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: [[RESULT:%.*]] = shufflevector <4 x double> [[TMP4]], <4 x double> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 6, i32 poison>
+; AVX-NEXT: ret <4 x double> [[RESULT]]
+; Input IR below: scalar pairwise fadds are packed as <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps lanes 0, 1 and 2 and leaves lane 3 poison (every mask index is below 4, so its %a operand is never selected).
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_uu23(<4 x double> %a, <4 x double> %b) {
+; SSE2-LABEL: @add_v4f64_uu23(
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fadd <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT: ret <4 x double> [[RESULT1]]
+;
+; SSE4-LABEL: @add_v4f64_uu23(
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: ret <4 x double> [[RESULT1]]
+;
+; AVX-LABEL: @add_v4f64_uu23(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT: [[RESULT1:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[RESULT1]]
+; Input IR below: scalar pairwise fadds are packed as <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps lanes 2 and 3 and leaves lanes 0 and 1 poison (every mask index is below 4, so its %a operand is never selected).
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+ ret <4 x double> %result
+}
+
+define <4 x double> @add_v4f64_01uu(<4 x double> %a, <4 x double> %b) {
+; CHECK-LABEL: @add_v4f64_01uu(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = fadd <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+; Input IR below: scalar pairwise fadds are packed as <a0+a1, b0+b1, a2+a3, b2+b3>; the final shuffle keeps lanes 0 and 1 and leaves lanes 2 and 3 poison (every mask index is below 4, so its %a operand is never selected).
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fadd double %a0, %a1
+ %a23 = fadd double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fadd double %b0, %b1
+ %b23 = fadd double %b2, %b3
+ %hadd0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hadd1 = insertelement <4 x double> %hadd0, double %b01, i32 1
+ %hadd2 = insertelement <4 x double> %hadd1, double %a23, i32 2
+ %hadd3 = insertelement <4 x double> %hadd2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hadd3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ ret <4 x double> %result
+}
More information about the llvm-commits
mailing list