[llvm] 67652a3 - [PhaseOrdering][X86] Add horizontal-sub test coverage for #34072
Simon Pilgrim via llvm-commits
llvm-commits at lists.llvm.org
Mon Jan 6 04:21:13 PST 2025
Author: Simon Pilgrim
Date: 2025-01-06T12:15:21Z
New Revision: 67652a3d9fa80dcd940ce8863e32d0a274f7f8e1
URL: https://github.com/llvm/llvm-project/commit/67652a3d9fa80dcd940ce8863e32d0a274f7f8e1
DIFF: https://github.com/llvm/llvm-project/commit/67652a3d9fa80dcd940ce8863e32d0a274f7f8e1.diff
LOG: [PhaseOrdering][X86] Add horizontal-sub test coverage for #34072
Mirrors the existing horizontal-add tests, with the additional constraint that subtraction is non-commutative, so operand order must be preserved.
Added:
llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
Modified:
Removed:
################################################################################
diff --git a/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
new file mode 100644
index 00000000000000..db8a774ba20f07
--- /dev/null
+++ b/llvm/test/Transforms/PhaseOrdering/X86/hsub.ll
@@ -0,0 +1,1155 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -O3 -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64 | FileCheck %s --check-prefixes=CHECK,SSE,SSE2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE,SSE4
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX,AVX2
+; RUN: opt < %s -passes="default<O3>" -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX,AVX512
+
+; PR34072 - failure to canonicalize to (sub/fsub (shuffle a, b), (shuffle a, b)) for optimal horizontal sub patterns (with undemanded elements)
+
+;
+; v8i16
+;
+
+define <8 x i16> @sub_v8i16_01234567(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: @sub_v8i16_01234567(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i16> [[TMP3]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %a01 = sub i16 %a0, %a1
+ %a23 = sub i16 %a2, %a3
+ %a45 = sub i16 %a4, %a5
+ %a67 = sub i16 %a6, %a7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %b01 = sub i16 %b0, %b1
+ %b23 = sub i16 %b2, %b3
+ %b45 = sub i16 %b4, %b5
+ %b67 = sub i16 %b6, %b7
+ %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+ %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+ %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+ %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+ %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+ %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+ %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+ %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+ %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %result
+}
+
+define <8 x i16> @sub_v8i16_u1234567(<8 x i16> %a, <8 x i16> %b) {
+; SSE2-LABEL: @sub_v8i16_u1234567(
+; SSE2-NEXT: [[A2:%.*]] = extractelement <8 x i16> [[A:%.*]], i64 2
+; SSE2-NEXT: [[A3:%.*]] = extractelement <8 x i16> [[A]], i64 3
+; SSE2-NEXT: [[A4:%.*]] = extractelement <8 x i16> [[A]], i64 4
+; SSE2-NEXT: [[A5:%.*]] = extractelement <8 x i16> [[A]], i64 5
+; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x i16> [[A]], i64 6
+; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x i16> [[A]], i64 7
+; SSE2-NEXT: [[A23:%.*]] = sub i16 [[A2]], [[A3]]
+; SSE2-NEXT: [[A45:%.*]] = sub i16 [[A4]], [[A5]]
+; SSE2-NEXT: [[A67:%.*]] = sub i16 [[A6]], [[A7]]
+; SSE2-NEXT: [[HSUB1:%.*]] = insertelement <8 x i16> poison, i16 [[A23]], i64 1
+; SSE2-NEXT: [[HSUB2:%.*]] = insertelement <8 x i16> [[HSUB1]], i16 [[A45]], i64 2
+; SSE2-NEXT: [[HSUB3:%.*]] = insertelement <8 x i16> [[HSUB2]], i16 [[A67]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[B:%.*]], <8 x i16> poison, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[B]], <8 x i16> poison, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = sub <8 x i16> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x i16> [[HSUB3]], <8 x i16> [[TMP3]], <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 8, i32 9, i32 10, i32 11>
+; SSE2-NEXT: ret <8 x i16> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8i16_u1234567(
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; SSE4-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; SSE4-NEXT: [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; SSE4-NEXT: ret <8 x i16> [[TMP7]]
+;
+; AVX-LABEL: @sub_v8i16_u1234567(
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[A:%.*]], <8 x i16> [[B:%.*]], <8 x i32> <i32 poison, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14>
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i16> [[A]], <8 x i16> [[B]], <8 x i32> <i32 poison, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = sub <8 x i16> [[TMP5]], [[TMP6]]
+; AVX-NEXT: ret <8 x i16> [[TMP7]]
+;
+ %a0 = extractelement <8 x i16> %a, i32 0
+ %a1 = extractelement <8 x i16> %a, i32 1
+ %a2 = extractelement <8 x i16> %a, i32 2
+ %a3 = extractelement <8 x i16> %a, i32 3
+ %a4 = extractelement <8 x i16> %a, i32 4
+ %a5 = extractelement <8 x i16> %a, i32 5
+ %a6 = extractelement <8 x i16> %a, i32 6
+ %a7 = extractelement <8 x i16> %a, i32 7
+ %a01 = sub i16 %a0, %a1
+ %a23 = sub i16 %a2, %a3
+ %a45 = sub i16 %a4, %a5
+ %a67 = sub i16 %a6, %a7
+ %b0 = extractelement <8 x i16> %b, i32 0
+ %b1 = extractelement <8 x i16> %b, i32 1
+ %b2 = extractelement <8 x i16> %b, i32 2
+ %b3 = extractelement <8 x i16> %b, i32 3
+ %b4 = extractelement <8 x i16> %b, i32 4
+ %b5 = extractelement <8 x i16> %b, i32 5
+ %b6 = extractelement <8 x i16> %b, i32 6
+ %b7 = extractelement <8 x i16> %b, i32 7
+ %b01 = sub i16 %b0, %b1
+ %b23 = sub i16 %b2, %b3
+ %b45 = sub i16 %b4, %b5
+ %b67 = sub i16 %b6, %b7
+ %hsub0 = insertelement <8 x i16> poison, i16 %a01, i32 0
+ %hsub1 = insertelement <8 x i16> %hsub0, i16 %a23, i32 1
+ %hsub2 = insertelement <8 x i16> %hsub1, i16 %a45, i32 2
+ %hsub3 = insertelement <8 x i16> %hsub2, i16 %a67, i32 3
+ %hsub4 = insertelement <8 x i16> %hsub3, i16 %b01, i32 4
+ %hsub5 = insertelement <8 x i16> %hsub4, i16 %b23, i32 5
+ %hsub6 = insertelement <8 x i16> %hsub5, i16 %b45, i32 6
+ %hsub7 = insertelement <8 x i16> %hsub6, i16 %b67, i32 7
+ %result = shufflevector <8 x i16> %hsub7, <8 x i16> %a, <8 x i32> <i32 poison, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %result
+}
+
+;
+; v4i32
+;
+
+define <4 x i32> @sub_v4i32_0123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0123(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_u123(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_u123(
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_0u23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_0u23(
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01u3(<4 x i32> %a, <4 x i32> %b) {
+; SSE2-LABEL: @sub_v4i32_01u3(
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE2-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: ret <4 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4i32_01u3(
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: ret <4 x i32> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4i32_01u3(
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: ret <4 x i32> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4i32_01u3(
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX512-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_012u(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_012u(
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; CHECK-NEXT: [[TMP4:%.*]] = sub <4 x i32> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x i32> [[TMP4]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_uu23(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_uu23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[B:%.*]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[B]], <4 x i32> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT: [[RESULT1:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[RESULT1]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3>
+ ret <4 x i32> %result
+}
+
+define <4 x i32> @sub_v4i32_01uu(<4 x i32> %a, <4 x i32> %b) {
+; CHECK-LABEL: @sub_v4i32_01uu(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[A:%.*]], <4 x i32> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[A]], <4 x i32> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <4 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x i32> [[TMP3]]
+;
+ %a0 = extractelement <4 x i32> %a, i32 0
+ %a1 = extractelement <4 x i32> %a, i32 1
+ %a2 = extractelement <4 x i32> %a, i32 2
+ %a3 = extractelement <4 x i32> %a, i32 3
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %b0 = extractelement <4 x i32> %b, i32 0
+ %b1 = extractelement <4 x i32> %b, i32 1
+ %b2 = extractelement <4 x i32> %b, i32 2
+ %b3 = extractelement <4 x i32> %b, i32 3
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %hsub0 = insertelement <4 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <4 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <4 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <4 x i32> %hsub2, i32 %b23, i32 3
+ %result = shufflevector <4 x i32> %hsub3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+ ret <4 x i32> %result
+}
+
+;
+; v8i32
+;
+
+define <8 x i32> @sub_v8i32_01234567(<8 x i32> %a, <8 x i32> %b) {
+; CHECK-LABEL: @sub_v8i32_01234567(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x i32> [[TMP3]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %a45 = sub i32 %a4, %a5
+ %a67 = sub i32 %a6, %a7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %b45 = sub i32 %b4, %b5
+ %b67 = sub i32 %b6, %b7
+ %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+ %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+ %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+ %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+ %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+ %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %result
+}
+
+define <8 x i32> @sub_v8i32_01234u67(<8 x i32> %a, <8 x i32> %b) {
+; SSE2-LABEL: @sub_v8i32_01234u67(
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; SSE2-NEXT: [[TMP4:%.*]] = sub <8 x i32> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: ret <8 x i32> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v8i32_01234u67(
+; SSE4-NEXT: [[A4:%.*]] = extractelement <8 x i32> [[A:%.*]], i64 4
+; SSE4-NEXT: [[A5:%.*]] = extractelement <8 x i32> [[A]], i64 5
+; SSE4-NEXT: [[A45:%.*]] = sub i32 [[A4]], [[A5]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = sub <8 x i32> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[HSUB4:%.*]] = insertelement <8 x i32> [[TMP3]], i32 [[A45]], i64 4
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[B]], <8 x i32> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = sub <8 x i32> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x i32> [[HSUB4]], <8 x i32> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 8, i32 9>
+; SSE4-NEXT: ret <8 x i32> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8i32_01234u67(
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[A:%.*]], <8 x i32> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 poison, i32 12, i32 14>
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x i32> [[A]], <8 x i32> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 poison, i32 13, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = sub <8 x i32> [[TMP5]], [[TMP6]]
+; AVX-NEXT: ret <8 x i32> [[TMP7]]
+;
+ %a0 = extractelement <8 x i32> %a, i32 0
+ %a1 = extractelement <8 x i32> %a, i32 1
+ %a2 = extractelement <8 x i32> %a, i32 2
+ %a3 = extractelement <8 x i32> %a, i32 3
+ %a4 = extractelement <8 x i32> %a, i32 4
+ %a5 = extractelement <8 x i32> %a, i32 5
+ %a6 = extractelement <8 x i32> %a, i32 6
+ %a7 = extractelement <8 x i32> %a, i32 7
+ %a01 = sub i32 %a0, %a1
+ %a23 = sub i32 %a2, %a3
+ %a45 = sub i32 %a4, %a5
+ %a67 = sub i32 %a6, %a7
+ %b0 = extractelement <8 x i32> %b, i32 0
+ %b1 = extractelement <8 x i32> %b, i32 1
+ %b2 = extractelement <8 x i32> %b, i32 2
+ %b3 = extractelement <8 x i32> %b, i32 3
+ %b4 = extractelement <8 x i32> %b, i32 4
+ %b5 = extractelement <8 x i32> %b, i32 5
+ %b6 = extractelement <8 x i32> %b, i32 6
+ %b7 = extractelement <8 x i32> %b, i32 7
+ %b01 = sub i32 %b0, %b1
+ %b23 = sub i32 %b2, %b3
+ %b45 = sub i32 %b4, %b5
+ %b67 = sub i32 %b6, %b7
+ %hsub0 = insertelement <8 x i32> poison, i32 %a01, i32 0
+ %hsub1 = insertelement <8 x i32> %hsub0, i32 %a23, i32 1
+ %hsub2 = insertelement <8 x i32> %hsub1, i32 %b01, i32 2
+ %hsub3 = insertelement <8 x i32> %hsub2, i32 %b23, i32 3
+ %hsub4 = insertelement <8 x i32> %hsub3, i32 %a45, i32 4
+ %hsub5 = insertelement <8 x i32> %hsub4, i32 %a67, i32 5
+ %hsub6 = insertelement <8 x i32> %hsub5, i32 %b45, i32 6
+ %hsub7 = insertelement <8 x i32> %hsub6, i32 %b67, i32 7
+ %result = shufflevector <8 x i32> %hsub7, <8 x i32> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 poison, i32 6, i32 7>
+ ret <8 x i32> %result
+}
+
+;
+; v4f32
+;
+
+define <4 x float> @sub_v4f32_0123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0123(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_u123(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_u123(
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 poison, i32 2, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 poison, i32 3, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_0u23(<4 x float> %a, <4 x float> %b) {
+; CHECK-LABEL: @sub_v4f32_0u23(
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 4, i32 6>
+; CHECK-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 poison, i32 5, i32 7>
+; CHECK-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; CHECK-NEXT: ret <4 x float> [[TMP4]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01u3(<4 x float> %a, <4 x float> %b) {
+; SSE2-LABEL: @sub_v4f32_01u3(
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE2-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: ret <4 x float> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f32_01u3(
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; SSE4-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: ret <4 x float> [[TMP4]]
+;
+; AVX2-LABEL: @sub_v4f32_01u3(
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX2-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: ret <4 x float> [[TMP4]]
+;
+; AVX512-LABEL: @sub_v4f32_01u3(
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 poison, i32 6>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 poison, i32 7>
+; AVX512-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: ret <4 x float> [[TMP4]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_012u(<4 x float> %a, <4 x float> %b) { ; v4f32 hsub pattern built from scalar fsubs -- result lanes = [a0-a1, a2-a3, b0-b1, poison]
+; SSE2-LABEL: @sub_v4f32_012u(
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; SSE2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; SSE2-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE2-NEXT: ret <4 x float> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f32_012u(
+; SSE4-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[B]], [[SHIFT]]
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; SSE4-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX2-LABEL: @sub_v4f32_012u(
+; AVX2-NEXT: [[SHIFT:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 1, i32 poison, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP1:%.*]] = fsub <4 x float> [[B]], [[SHIFT]]
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; AVX2-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX2-NEXT: [[RESULT:%.*]] = shufflevector <4 x float> [[TMP4]], <4 x float> [[TMP1]], <4 x i32> <i32 0, i32 1, i32 4, i32 poison>
+; AVX2-NEXT: ret <4 x float> [[RESULT]]
+;
+; AVX512-LABEL: @sub_v4f32_012u(
+; AVX512-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> [[B:%.*]], <4 x i32> <i32 0, i32 2, i32 4, i32 poison>
+; AVX512-NEXT: [[TMP3:%.*]] = shufflevector <4 x float> [[A]], <4 x float> [[B]], <4 x i32> <i32 1, i32 3, i32 5, i32 poison>
+; AVX512-NEXT: [[TMP4:%.*]] = fsub <4 x float> [[TMP2]], [[TMP3]]
+; AVX512-NEXT: ret <4 x float> [[TMP4]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> ; indices 0-2 select only from %hsub3 (never %a); lane 3 is poison, so %b23 is not demanded
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_uu23(<4 x float> %a, <4 x float> %b) { ; v4f32 hsub pattern with only the upper half kept -- result lanes = [poison, poison, b0-b1, b2-b3]
+; CHECK-LABEL: @sub_v4f32_uu23(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[B:%.*]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[B]], <4 x float> poison, <4 x i32> <i32 poison, i32 poison, i32 1, i32 3>
+; CHECK-NEXT: [[RESULT1:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[RESULT1]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3> ; lanes 0-1 are poison, so %a01 and %a23 are not demanded; the expected output above reads only %b
+ ret <4 x float> %result
+}
+
+define <4 x float> @sub_v4f32_01uu(<4 x float> %a, <4 x float> %b) { ; v4f32 hsub pattern with only the lower half kept -- result lanes = [a0-a1, a2-a3, poison, poison]
+; CHECK-LABEL: @sub_v4f32_01uu(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[A:%.*]], <4 x float> poison, <4 x i32> <i32 0, i32 2, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[A]], <4 x float> poison, <4 x i32> <i32 1, i32 3, i32 poison, i32 poison>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x float> [[TMP3]]
+;
+ %a0 = extractelement <4 x float> %a, i32 0
+ %a1 = extractelement <4 x float> %a, i32 1
+ %a2 = extractelement <4 x float> %a, i32 2
+ %a3 = extractelement <4 x float> %a, i32 3
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %b0 = extractelement <4 x float> %b, i32 0
+ %b1 = extractelement <4 x float> %b, i32 1
+ %b2 = extractelement <4 x float> %b, i32 2
+ %b3 = extractelement <4 x float> %b, i32 3
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %hsub0 = insertelement <4 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <4 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <4 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <4 x float> %hsub2, float %b23, i32 3
+ %result = shufflevector <4 x float> %hsub3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; lanes 2-3 are poison, so %b01 and %b23 are not demanded; the expected output above reads only %a
+ ret <4 x float> %result
+}
+
+;
+; v8f32
+;
+
+define <8 x float> @sub_v8f32_01234567(<8 x float> %a, <8 x float> %b) { ; v8f32 hsub pattern, all lanes kept -- result lanes = [a0-a1, a2-a3, b0-b1, b2-b3, a4-a5, a6-a7, b4-b5, b6-b7]
+; CHECK-LABEL: @sub_v8f32_01234567(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 10, i32 4, i32 6, i32 12, i32 14>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 11, i32 5, i32 7, i32 13, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <8 x float> [[TMP3]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %a45 = fsub float %a4, %a5
+ %a67 = fsub float %a6, %a7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %b45 = fsub float %b4, %b5
+ %b67 = fsub float %b6, %b7
+ %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+ %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+ %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+ %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+ %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+ %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ; identity mask -- every lane comes from %hsub7, %a is never selected
+ ret <8 x float> %result
+}
+
+define <8 x float> @sub_v8f32_012u4567(<8 x float> %a, <8 x float> %b) { ; v8f32 hsub pattern with lane 3 dropped -- result lanes = [a0-a1, a2-a3, b0-b1, poison, a4-a5, a6-a7, b4-b5, b6-b7]
+; SSE2-LABEL: @sub_v8f32_012u4567(
+; SSE2-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE2-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE2-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[B:%.*]], <8 x float> poison, <2 x i32> <i32 4, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <2 x i32> <i32 5, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x float> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE2-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP6]], float [[A67]], i64 5
+; SSE2-NEXT: [[TMP7:%.*]] = shufflevector <2 x float> [[TMP3]], <2 x float> poison, <8 x i32> <i32 0, i32 1, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE2-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP7]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE2-NEXT: ret <8 x float> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v8f32_012u4567(
+; SSE4-NEXT: [[A6:%.*]] = extractelement <8 x float> [[A:%.*]], i64 6
+; SSE4-NEXT: [[A7:%.*]] = extractelement <8 x float> [[A]], i64 7
+; SSE4-NEXT: [[A67:%.*]] = fsub float [[A6]], [[A7]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <8 x float> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[HSUB5:%.*]] = insertelement <8 x float> [[TMP3]], float [[A67]], i64 5
+; SSE4-NEXT: [[TMP4:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 4, i32 6, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[B]], <8 x float> poison, <8 x i32> <i32 5, i32 7, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP6:%.*]] = fsub <8 x float> [[TMP4]], [[TMP5]]
+; SSE4-NEXT: [[RESULT:%.*]] = shufflevector <8 x float> [[HSUB5]], <8 x float> [[TMP6]], <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 8, i32 9>
+; SSE4-NEXT: ret <8 x float> [[RESULT]]
+;
+; AVX-LABEL: @sub_v8f32_012u4567(
+; AVX-NEXT: [[TMP5:%.*]] = shufflevector <8 x float> [[A:%.*]], <8 x float> [[B:%.*]], <8 x i32> <i32 0, i32 2, i32 8, i32 poison, i32 4, i32 6, i32 12, i32 14>
+; AVX-NEXT: [[TMP6:%.*]] = shufflevector <8 x float> [[A]], <8 x float> [[B]], <8 x i32> <i32 1, i32 3, i32 9, i32 poison, i32 5, i32 7, i32 13, i32 15>
+; AVX-NEXT: [[TMP7:%.*]] = fsub <8 x float> [[TMP5]], [[TMP6]]
+; AVX-NEXT: ret <8 x float> [[TMP7]]
+;
+ %a0 = extractelement <8 x float> %a, i32 0
+ %a1 = extractelement <8 x float> %a, i32 1
+ %a2 = extractelement <8 x float> %a, i32 2
+ %a3 = extractelement <8 x float> %a, i32 3
+ %a4 = extractelement <8 x float> %a, i32 4
+ %a5 = extractelement <8 x float> %a, i32 5
+ %a6 = extractelement <8 x float> %a, i32 6
+ %a7 = extractelement <8 x float> %a, i32 7
+ %a01 = fsub float %a0, %a1
+ %a23 = fsub float %a2, %a3
+ %a45 = fsub float %a4, %a5
+ %a67 = fsub float %a6, %a7
+ %b0 = extractelement <8 x float> %b, i32 0
+ %b1 = extractelement <8 x float> %b, i32 1
+ %b2 = extractelement <8 x float> %b, i32 2
+ %b3 = extractelement <8 x float> %b, i32 3
+ %b4 = extractelement <8 x float> %b, i32 4
+ %b5 = extractelement <8 x float> %b, i32 5
+ %b6 = extractelement <8 x float> %b, i32 6
+ %b7 = extractelement <8 x float> %b, i32 7
+ %b01 = fsub float %b0, %b1
+ %b23 = fsub float %b2, %b3
+ %b45 = fsub float %b4, %b5
+ %b67 = fsub float %b6, %b7
+ %hsub0 = insertelement <8 x float> poison, float %a01, i32 0
+ %hsub1 = insertelement <8 x float> %hsub0, float %a23, i32 1
+ %hsub2 = insertelement <8 x float> %hsub1, float %b01, i32 2
+ %hsub3 = insertelement <8 x float> %hsub2, float %b23, i32 3
+ %hsub4 = insertelement <8 x float> %hsub3, float %a45, i32 4
+ %hsub5 = insertelement <8 x float> %hsub4, float %a67, i32 5
+ %hsub6 = insertelement <8 x float> %hsub5, float %b45, i32 6
+ %hsub7 = insertelement <8 x float> %hsub6, float %b67, i32 7
+ %result = shufflevector <8 x float> %hsub7, <8 x float> %a, <8 x i32> <i32 0, i32 1, i32 2, i32 poison, i32 4, i32 5, i32 6, i32 7> ; all kept indices are below 8, so only %hsub7 is read; lane 3 is poison and %b23 is not demanded
+ ret <8 x float> %result
+}
+
+;
+; v2f64
+;
+
+define <2 x double> @sub_v2f64_01(<2 x double> %a, <2 x double> %b) { ; v2f64 hsub pattern, both lanes kept -- result lanes = [a0-a1, b0-b1]
+; CHECK-LABEL: @sub_v2f64_01(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> [[B:%.*]], <2 x i32> <i32 0, i32 2>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <2 x double> [[A]], <2 x double> [[B]], <2 x i32> <i32 1, i32 3>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <2 x double> [[TMP3]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fsub double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fsub double %b0, %b1
+ %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 1> ; identity mask -- both lanes come from %hsub1, %a is never selected
+ ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_u1(<2 x double> %a, <2 x double> %b) { ; v2f64 hsub pattern with only lane 1 kept -- result lanes = [poison, b0-b1]
+; CHECK-LABEL: @sub_v2f64_u1(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[B:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[B]], [[SHIFT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 poison, i32 0>
+; CHECK-NEXT: ret <2 x double> [[RESULT]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fsub double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fsub double %b0, %b1
+ %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 poison, i32 1> ; lane 0 is poison, so %a01 is not demanded; the expected output above reads only %b
+ ret <2 x double> %result
+}
+
+define <2 x double> @sub_v2f64_0u(<2 x double> %a, <2 x double> %b) { ; v2f64 hsub pattern with only lane 0 kept -- result lanes = [a0-a1, poison]
+; CHECK-LABEL: @sub_v2f64_0u(
+; CHECK-NEXT: [[SHIFT:%.*]] = shufflevector <2 x double> [[A:%.*]], <2 x double> poison, <2 x i32> <i32 1, i32 poison>
+; CHECK-NEXT: [[TMP1:%.*]] = fsub <2 x double> [[A]], [[SHIFT]]
+; CHECK-NEXT: [[RESULT:%.*]] = shufflevector <2 x double> [[TMP1]], <2 x double> poison, <2 x i32> <i32 0, i32 poison>
+; CHECK-NEXT: ret <2 x double> [[RESULT]]
+;
+ %a0 = extractelement <2 x double> %a, i32 0
+ %a1 = extractelement <2 x double> %a, i32 1
+ %a01 = fsub double %a0, %a1
+ %b0 = extractelement <2 x double> %b, i32 0
+ %b1 = extractelement <2 x double> %b, i32 1
+ %b01 = fsub double %b0, %b1
+ %hsub0 = insertelement <2 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <2 x double> %hsub0, double %b01, i32 1
+ %result = shufflevector <2 x double> %hsub1, <2 x double> %a, <2 x i32> <i32 0, i32 poison> ; lane 1 is poison, so %b01 is not demanded; the expected output above reads only %a
+ ret <2 x double> %result
+}
+
+;
+; v4f64
+;
+
+define <4 x double> @sub_v4f64_0123(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern, all lanes kept -- result lanes = [a0-a1, b0-b1, a2-a3, b2-b3]
+; CHECK-LABEL: @sub_v4f64_0123(
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 6>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 7>
+; CHECK-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: ret <4 x double> [[TMP3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 3> ; identity mask -- every lane comes from %hsub3, %a is never selected
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_u123(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with lane 0 dropped -- result lanes = [poison, b0-b1, a2-a3, b2-b3]
+; SSE2-LABEL: @sub_v4f64_u123(
+; SSE2-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <2 x i32> <i32 0, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <2 x i32> <i32 1, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 0, i32 1, i32 poison>
+; SSE2-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT: ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_u123(
+; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_u123(
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[B:%.*]], <4 x double> [[A:%.*]], <4 x i32> <i32 poison, i32 0, i32 6, i32 2>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[B]], <4 x double> [[A]], <4 x i32> <i32 poison, i32 1, i32 7, i32 3>
+; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: ret <4 x double> [[TMP4]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 1, i32 2, i32 3> ; indices 1-3 select only from %hsub3 (never %a); lane 0 is poison, so %a01 is not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_0u23(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with lane 1 dropped -- result lanes = [a0-a1, poison, a2-a3, b2-b3]
+; SSE-LABEL: @sub_v4f64_0u23(
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> poison, <2 x i32> <i32 0, i32 2>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> poison, <2 x i32> <i32 1, i32 3>
+; SSE-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 poison, i32 1, i32 poison>
+; SSE-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_0u23(
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 poison, i32 3, i32 7>
+; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: ret <4 x double> [[TMP4]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 poison, i32 2, i32 3> ; kept indices select only from %hsub3 (never %a); lane 1 is poison, so %b01 is not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01u3(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with lane 2 dropped -- result lanes = [a0-a1, b0-b1, poison, b2-b3]
+; SSE2-LABEL: @sub_v4f64_01u3(
+; SSE2-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE2-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[B23]], i64 3
+; SSE2-NEXT: ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_01u3(
+; SSE4-NEXT: [[B2:%.*]] = extractelement <4 x double> [[B:%.*]], i64 2
+; SSE4-NEXT: [[B3:%.*]] = extractelement <4 x double> [[B]], i64 3
+; SSE4-NEXT: [[B23:%.*]] = fsub double [[B2]], [[B3]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[B23]], i64 3
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_01u3(
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 6>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 7>
+; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: ret <4 x double> [[TMP4]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 3> ; kept indices select only from %hsub3 (never %a); lane 2 is poison, so %a23 is not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_012u(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with lane 3 dropped -- result lanes = [a0-a1, b0-b1, a2-a3, poison]
+; SSE2-LABEL: @sub_v4f64_012u(
+; SSE2-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE2-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE2-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP4]], double [[A23]], i64 2
+; SSE2-NEXT: ret <4 x double> [[RESULT]]
+;
+; SSE4-LABEL: @sub_v4f64_012u(
+; SSE4-NEXT: [[A2:%.*]] = extractelement <4 x double> [[A:%.*]], i64 2
+; SSE4-NEXT: [[A3:%.*]] = extractelement <4 x double> [[A]], i64 3
+; SSE4-NEXT: [[A23:%.*]] = fsub double [[A2]], [[A3]]
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: [[RESULT:%.*]] = insertelement <4 x double> [[TMP3]], double [[A23]], i64 2
+; SSE4-NEXT: ret <4 x double> [[RESULT]]
+;
+; AVX-LABEL: @sub_v4f64_012u(
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 2, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 3, i32 poison>
+; AVX-NEXT: [[TMP4:%.*]] = fsub <4 x double> [[TMP2]], [[TMP3]]
+; AVX-NEXT: ret <4 x double> [[TMP4]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 poison> ; indices 0-2 select only from %hsub3 (never %a); lane 3 is poison, so %b23 is not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_uu23(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with only the upper half kept -- result lanes = [poison, poison, a2-a3, b2-b3]
+; SSE2-LABEL: @sub_v4f64_uu23(
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 2, i32 6>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 3, i32 7>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[RESULT1:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 poison, i32 poison, i32 0, i32 1>
+; SSE2-NEXT: ret <4 x double> [[RESULT1]]
+;
+; SSE4-LABEL: @sub_v4f64_uu23(
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; SSE4-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: ret <4 x double> [[RESULT1]]
+;
+; AVX-LABEL: @sub_v4f64_uu23(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 poison, i32 poison, i32 2, i32 6>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 poison, i32 poison, i32 3, i32 7>
+; AVX-NEXT: [[RESULT1:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[RESULT1]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 poison, i32 poison, i32 2, i32 3> ; indices 2-3 select only from %hsub3 (never %a); lanes 0-1 are poison, so %a01 and %b01 are not demanded
+ ret <4 x double> %result
+}
+
+define <4 x double> @sub_v4f64_01uu(<4 x double> %a, <4 x double> %b) { ; v4f64 hsub pattern with only the lower half kept -- result lanes = [a0-a1, b0-b1, poison, poison]
+; SSE2-LABEL: @sub_v4f64_01uu(
+; SSE2-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <2 x i32> <i32 0, i32 4>
+; SSE2-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <2 x i32> <i32 1, i32 5>
+; SSE2-NEXT: [[TMP3:%.*]] = fsub <2 x double> [[TMP1]], [[TMP2]]
+; SSE2-NEXT: [[TMP4:%.*]] = shufflevector <2 x double> [[TMP3]], <2 x double> poison, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison>
+; SSE2-NEXT: ret <4 x double> [[TMP4]]
+;
+; SSE4-LABEL: @sub_v4f64_01uu(
+; SSE4-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; SSE4-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; SSE4-NEXT: ret <4 x double> [[TMP3]]
+;
+; AVX-LABEL: @sub_v4f64_01uu(
+; AVX-NEXT: [[TMP1:%.*]] = shufflevector <4 x double> [[A:%.*]], <4 x double> [[B:%.*]], <4 x i32> <i32 0, i32 4, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP2:%.*]] = shufflevector <4 x double> [[A]], <4 x double> [[B]], <4 x i32> <i32 1, i32 5, i32 poison, i32 poison>
+; AVX-NEXT: [[TMP3:%.*]] = fsub <4 x double> [[TMP1]], [[TMP2]]
+; AVX-NEXT: ret <4 x double> [[TMP3]]
+;
+ %a0 = extractelement <4 x double> %a, i32 0
+ %a1 = extractelement <4 x double> %a, i32 1
+ %a2 = extractelement <4 x double> %a, i32 2
+ %a3 = extractelement <4 x double> %a, i32 3
+ %a01 = fsub double %a0, %a1
+ %a23 = fsub double %a2, %a3
+ %b0 = extractelement <4 x double> %b, i32 0
+ %b1 = extractelement <4 x double> %b, i32 1
+ %b2 = extractelement <4 x double> %b, i32 2
+ %b3 = extractelement <4 x double> %b, i32 3
+ %b01 = fsub double %b0, %b1
+ %b23 = fsub double %b2, %b3
+ %hsub0 = insertelement <4 x double> poison, double %a01, i32 0
+ %hsub1 = insertelement <4 x double> %hsub0, double %b01, i32 1
+ %hsub2 = insertelement <4 x double> %hsub1, double %a23, i32 2
+ %hsub3 = insertelement <4 x double> %hsub2, double %b23, i32 3
+ %result = shufflevector <4 x double> %hsub3, <4 x double> %a, <4 x i32> <i32 0, i32 1, i32 poison, i32 poison> ; indices 0-1 select only from %hsub3 (never %a); lanes 2-3 are poison, so %a23 and %b23 are not demanded
+ ret <4 x double> %result
+}
More information about the llvm-commits
mailing list