[llvm] [VectorCombine] Handle shuffle of selects (PR #128032)
via llvm-commits
llvm-commits at lists.llvm.org
Sat Feb 22 05:02:16 PST 2025
https://github.com/ParkHanbum updated https://github.com/llvm/llvm-project/pull/128032
>From 28917e8eccda94d82c0d6c878f730b0539569e98 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Fri, 21 Feb 2025 02:13:11 +0900
Subject: [PATCH 1/3] add pre-test
---
.../VectorCombine/X86/shuffle-of-selects.ll | 582 ++++++++++++++++++
1 file changed, 582 insertions(+)
create mode 100644 llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
new file mode 100644
index 0000000000000..91a650f751902
--- /dev/null
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
@@ -0,0 +1,582 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,SSE
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,AVX2
+; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
+
+define <8 x i16> @src_v4tov8_i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %z) {
+; CHECK-LABEL: define <8 x i16> @src_v4tov8_i16(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
+; CHECK-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i16> [[X]], <4 x i16> [[Z]]
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i16> [[Y]], <4 x i16> [[X]]
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[SELECT_XZ]], <4 x i16> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i16> %x, %y
+ %cmp.yz = icmp slt <4 x i16> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i16> %x, <4 x i16> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i16> %y, <4 x i16> %x
+ %res = shufflevector <4 x i16> %select.xz, <4 x i16> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i16> %res
+}
+
+define <16 x i16> @src_v8tov16_i16(<8 x i16> %x, <8 x i16> %y, <8 x i16> %z) {
+; SSE-LABEL: define <16 x i16> @src_v8tov16_i16(
+; SSE-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i16> [[X]], <8 x i16> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i16> [[Y]], <8 x i16> [[Z]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <16 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i16> [[X]], <8 x i16> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i16> [[Z]], <8 x i16> [[X]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[RES:%.*]] = select <16 x i1> [[TMP3]], <16 x i16> [[TMP4]], <16 x i16> [[TMP5]]
+; SSE-NEXT: ret <16 x i16> [[RES]]
+;
+; AVX2-LABEL: define <16 x i16> @src_v8tov16_i16(
+; AVX2-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = icmp slt <8 x i16> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = icmp slt <8 x i16> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select <8 x i1> [[CMP_XY]], <8 x i16> [[X]], <8 x i16> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select <8 x i1> [[CMP_YZ]], <8 x i16> [[Y]], <8 x i16> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <8 x i16> [[SELECT_XZ]], <8 x i16> [[SELECT_YX]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: ret <16 x i16> [[RES]]
+;
+; AVX512-LABEL: define <16 x i16> @src_v8tov16_i16(
+; AVX512-SAME: <8 x i16> [[X:%.*]], <8 x i16> [[Y:%.*]], <8 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <8 x i16> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <8 x i16> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <8 x i1> [[CMP_XY]], <8 x i16> [[X]], <8 x i16> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <8 x i1> [[CMP_YZ]], <8 x i16> [[Y]], <8 x i16> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <8 x i16> [[SELECT_XZ]], <8 x i16> [[SELECT_YX]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <16 x i16> [[RES]]
+;
+ %cmp.xy = icmp slt <8 x i16> %x, %y
+ %cmp.yz = icmp slt <8 x i16> %y, %z
+ %select.xz = select <8 x i1> %cmp.xy, <8 x i16> %x, <8 x i16> %z
+ %select.yx = select <8 x i1> %cmp.yz, <8 x i16> %y, <8 x i16> %x
+ %res = shufflevector <8 x i16> %select.xz, <8 x i16> %select.yx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i16> %res
+}
+
+define <32 x i16> @src_v16tov32_i16(<16 x i16> %x, <16 x i16> %y, <16 x i16> %z) {
+; SSE-LABEL: define <32 x i16> @src_v16tov32_i16(
+; SSE-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[X]], <16 x i16> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[Y]], <16 x i16> [[Z]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <32 x i16> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X]], <16 x i16> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[Z]], <16 x i16> [[X]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; SSE-NEXT: [[RES:%.*]] = select <32 x i1> [[TMP3]], <32 x i16> [[TMP4]], <32 x i16> [[TMP5]]
+; SSE-NEXT: ret <32 x i16> [[RES]]
+;
+; AVX2-LABEL: define <32 x i16> @src_v16tov32_i16(
+; AVX2-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <16 x i16> [[X]], <16 x i16> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <16 x i16> [[Y]], <16 x i16> [[Z]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: [[TMP3:%.*]] = icmp slt <32 x i16> [[TMP1]], [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <16 x i16> [[X]], <16 x i16> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <16 x i16> [[Z]], <16 x i16> [[X]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX2-NEXT: [[RES:%.*]] = select <32 x i1> [[TMP3]], <32 x i16> [[TMP4]], <32 x i16> [[TMP5]]
+; AVX2-NEXT: ret <32 x i16> [[RES]]
+;
+; AVX512-LABEL: define <32 x i16> @src_v16tov32_i16(
+; AVX512-SAME: <16 x i16> [[X:%.*]], <16 x i16> [[Y:%.*]], <16 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <16 x i16> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <16 x i16> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <16 x i1> [[CMP_XY]], <16 x i16> [[X]], <16 x i16> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <16 x i1> [[CMP_YZ]], <16 x i16> [[Y]], <16 x i16> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <16 x i16> [[SELECT_XZ]], <16 x i16> [[SELECT_YX]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; AVX512-NEXT: ret <32 x i16> [[RES]]
+;
+ %cmp.xy = icmp slt <16 x i16> %x, %y
+ %cmp.yz = icmp slt <16 x i16> %y, %z
+ %select.xz = select <16 x i1> %cmp.xy, <16 x i16> %x, <16 x i16> %z
+ %select.yx = select <16 x i1> %cmp.yz, <16 x i16> %y, <16 x i16> %x
+ %res = shufflevector <16 x i16> %select.xz, <16 x i16> %select.yx, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i16> %res
+}
+
+define <8 x i32> @src_v4tov8_i32(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; SSE-LABEL: define <8 x i32> @src_v4tov8_i32(
+; SSE-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i32> [[Y]], <4 x i32> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <8 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[X]], <4 x i32> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i32> [[Z]], <4 x i32> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select <8 x i1> [[TMP3]], <8 x i32> [[TMP4]], <8 x i32> [[TMP5]]
+; SSE-NEXT: ret <8 x i32> [[RES]]
+;
+; AVX2-LABEL: define <8 x i32> @src_v4tov8_i32(
+; AVX2-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i32> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i32> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i32> [[X]], <4 x i32> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i32> [[Y]], <4 x i32> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x i32> [[SELECT_XZ]], <4 x i32> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x i32> [[RES]]
+;
+; AVX512-LABEL: define <8 x i32> @src_v4tov8_i32(
+; AVX512-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i32> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i32> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i32> [[X]], <4 x i32> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i32> [[Y]], <4 x i32> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x i32> [[SELECT_XZ]], <4 x i32> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x i32> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i32> %x, %y
+ %cmp.yz = icmp slt <4 x i32> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i32> %x, <4 x i32> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i32> %y, <4 x i32> %x
+ %res = shufflevector <4 x i32> %select.xz, <4 x i32> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i32> %res
+}
+
+define <16 x i32> @src_v8tov16_i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
+; SSE-LABEL: define <16 x i32> @src_v8tov16_i32(
+; SSE-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]], <8 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y]], <8 x i32> [[Z]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <16 x i32> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[Z]], <8 x i32> [[X]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; SSE-NEXT: [[RES:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]]
+; SSE-NEXT: ret <16 x i32> [[RES]]
+;
+; AVX2-LABEL: define <16 x i32> @src_v8tov16_i32(
+; AVX2-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]], <8 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <8 x i32> [[Y]], <8 x i32> [[Z]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[TMP3:%.*]] = icmp slt <16 x i32> [[TMP1]], [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <8 x i32> [[X]], <8 x i32> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <8 x i32> [[Z]], <8 x i32> [[X]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX2-NEXT: [[RES:%.*]] = select <16 x i1> [[TMP3]], <16 x i32> [[TMP4]], <16 x i32> [[TMP5]]
+; AVX2-NEXT: ret <16 x i32> [[RES]]
+;
+; AVX512-LABEL: define <16 x i32> @src_v8tov16_i32(
+; AVX512-SAME: <8 x i32> [[X:%.*]], <8 x i32> [[Y:%.*]], <8 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <8 x i32> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <8 x i32> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <8 x i1> [[CMP_XY]], <8 x i32> [[X]], <8 x i32> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <8 x i1> [[CMP_YZ]], <8 x i32> [[Y]], <8 x i32> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <8 x i32> [[SELECT_XZ]], <8 x i32> [[SELECT_YX]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; AVX512-NEXT: ret <16 x i32> [[RES]]
+;
+ %cmp.xy = icmp slt <8 x i32> %x, %y
+ %cmp.yz = icmp slt <8 x i32> %y, %z
+ %select.xz = select <8 x i1> %cmp.xy, <8 x i32> %x, <8 x i32> %z
+ %select.yx = select <8 x i1> %cmp.yz, <8 x i32> %y, <8 x i32> %x
+ %res = shufflevector <8 x i32> %select.xz, <8 x i32> %select.yx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %res
+}
+
+define <32 x i32> @src_v16tov32_i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
+; CHECK-LABEL: define <32 x i32> @src_v16tov32_i32(
+; CHECK-SAME: <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[Z]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <32 x i32> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i32> [[Z]], <16 x i32> [[X]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[TMP3]], <32 x i32> [[TMP4]], <32 x i32> [[TMP5]]
+; CHECK-NEXT: ret <32 x i32> [[RES]]
+;
+ %cmp.xy = icmp slt <16 x i32> %x, %y
+ %cmp.yz = icmp slt <16 x i32> %y, %z
+ %select.xz = select <16 x i1> %cmp.xy, <16 x i32> %x, <16 x i32> %z
+ %select.yx = select <16 x i1> %cmp.yz, <16 x i32> %y, <16 x i32> %x
+ %res = shufflevector <16 x i32> %select.xz, <16 x i32> %select.yx, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i32> %res
+}
+
+define <8 x i64> @src_v4tov8_i64(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; SSE-LABEL: define <8 x i64> @src_v4tov8_i64(
+; SSE-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]], <4 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[X]], <4 x i64> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[Y]], <4 x i64> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = icmp slt <8 x i64> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X]], <4 x i64> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[Z]], <4 x i64> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[TMP4]], <8 x i64> [[TMP5]]
+; SSE-NEXT: ret <8 x i64> [[RES]]
+;
+; AVX2-LABEL: define <8 x i64> @src_v4tov8_i64(
+; AVX2-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]], <4 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i64> [[X]], <4 x i64> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i64> [[Y]], <4 x i64> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[TMP3:%.*]] = icmp slt <8 x i64> [[TMP1]], [[TMP2]]
+; AVX2-NEXT: [[TMP4:%.*]] = shufflevector <4 x i64> [[X]], <4 x i64> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[TMP5:%.*]] = shufflevector <4 x i64> [[Z]], <4 x i64> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[RES:%.*]] = select <8 x i1> [[TMP3]], <8 x i64> [[TMP4]], <8 x i64> [[TMP5]]
+; AVX2-NEXT: ret <8 x i64> [[RES]]
+;
+; AVX512-LABEL: define <8 x i64> @src_v4tov8_i64(
+; AVX512-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]], <4 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i64> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i64> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i64> [[X]], <4 x i64> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i64> [[Y]], <4 x i64> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[SELECT_XZ]], <4 x i64> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x i64> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i64> %x, %y
+ %cmp.yz = icmp slt <4 x i64> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i64> %x, <4 x i64> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i64> %y, <4 x i64> %x
+ %res = shufflevector <4 x i64> %select.xz, <4 x i64> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x i64> %res
+}
+
+define <16 x i64> @src_v8tov16_i64(<8 x i64> %x, <8 x i64> %y, <8 x i64> %z) {
+; CHECK-LABEL: define <16 x i64> @src_v8tov16_i64(
+; CHECK-SAME: <8 x i64> [[X:%.*]], <8 x i64> [[Y:%.*]], <8 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <8 x i64> [[Y]], <8 x i64> [[Z]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <16 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <8 x i64> [[X]], <8 x i64> [[Y]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <8 x i64> [[Z]], <8 x i64> [[X]], <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+; CHECK-NEXT: [[RES:%.*]] = select <16 x i1> [[TMP3]], <16 x i64> [[TMP4]], <16 x i64> [[TMP5]]
+; CHECK-NEXT: ret <16 x i64> [[RES]]
+;
+ %cmp.xy = icmp slt <8 x i64> %x, %y
+ %cmp.yz = icmp slt <8 x i64> %y, %z
+ %select.xz = select <8 x i1> %cmp.xy, <8 x i64> %x, <8 x i64> %z
+ %select.yx = select <8 x i1> %cmp.yz, <8 x i64> %y, <8 x i64> %x
+ %res = shufflevector <8 x i64> %select.xz, <8 x i64> %select.yx, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i64> %res
+}
+
+define <32 x i64> @src_v16tov32_i64(<16 x i64> %x, <16 x i64> %y, <16 x i64> %z) {
+; CHECK-LABEL: define <32 x i64> @src_v16tov32_i64(
+; CHECK-SAME: <16 x i64> [[X:%.*]], <16 x i64> [[Y:%.*]], <16 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i64> [[X]], <16 x i64> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i64> [[Y]], <16 x i64> [[Z]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <32 x i64> [[TMP1]], [[TMP2]]
+; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <16 x i64> [[X]], <16 x i64> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[TMP5:%.*]] = shufflevector <16 x i64> [[Z]], <16 x i64> [[X]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+; CHECK-NEXT: [[RES:%.*]] = select <32 x i1> [[TMP3]], <32 x i64> [[TMP4]], <32 x i64> [[TMP5]]
+; CHECK-NEXT: ret <32 x i64> [[RES]]
+;
+ %cmp.xy = icmp slt <16 x i64> %x, %y
+ %cmp.yz = icmp slt <16 x i64> %y, %z
+ %select.xz = select <16 x i1> %cmp.xy, <16 x i64> %x, <16 x i64> %z
+ %select.yx = select <16 x i1> %cmp.yz, <16 x i64> %y, <16 x i64> %x
+ %res = shufflevector <16 x i64> %select.xz, <16 x i64> %select.yx, <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
+ ret <32 x i64> %res
+}
+
+; non-sequential mask
+define <8 x i16> @src_v4tov8_i16_random(<4 x i16> %x, <4 x i16> %y, <4 x i16> %z) {
+; CHECK-LABEL: define <8 x i16> @src_v4tov8_i16_random(
+; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
+; CHECK-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i16> [[X]], <4 x i16> [[Z]]
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i16> [[Y]], <4 x i16> [[X]]
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[SELECT_XZ]], <4 x i16> [[SELECT_YX]], <8 x i32> <i32 3, i32 6, i32 1, i32 4, i32 0, i32 7, i32 2, i32 5>
+; CHECK-NEXT: ret <8 x i16> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i16> %x, %y
+ %cmp.yz = icmp slt <4 x i16> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i16> %x, <4 x i16> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i16> %y, <4 x i16> %x
+ %res = shufflevector <4 x i16> %select.xz, <4 x i16> %select.yx, <8 x i32> <i32 3, i32 6, i32 1, i32 4, i32 0, i32 7, i32 2, i32 5>
+ ret <8 x i16> %res
+}
+
+define <8 x i32> @src_v4tov8_i32_random(<4 x i32> %x, <4 x i32> %y, <4 x i32> %z) {
+; CHECK-LABEL: define <8 x i32> @src_v4tov8_i32_random(
+; CHECK-SAME: <4 x i32> [[X:%.*]], <4 x i32> [[Y:%.*]], <4 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i32> [[X]], [[Y]]
+; CHECK-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i32> [[Y]], [[Z]]
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i32> [[X]], <4 x i32> [[Z]]
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i32> [[Y]], <4 x i32> [[X]]
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i32> [[SELECT_XZ]], <4 x i32> [[SELECT_YX]], <8 x i32> <i32 5, i32 2, i32 7, i32 0, i32 6, i32 1, i32 4, i32 3>
+; CHECK-NEXT: ret <8 x i32> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i32> %x, %y
+ %cmp.yz = icmp slt <4 x i32> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i32> %x, <4 x i32> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i32> %y, <4 x i32> %x
+ %res = shufflevector <4 x i32> %select.xz, <4 x i32> %select.yx, <8 x i32> <i32 5, i32 2, i32 7, i32 0, i32 6, i32 1, i32 4, i32 3>
+ ret <8 x i32> %res
+}
+
+define <8 x i64> @src_v4tov8_i64_random(<4 x i64> %x, <4 x i64> %y, <4 x i64> %z) {
+; CHECK-LABEL: define <8 x i64> @src_v4tov8_i64_random(
+; CHECK-SAME: <4 x i64> [[X:%.*]], <4 x i64> [[Y:%.*]], <4 x i64> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i64> [[X]], [[Y]]
+; CHECK-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i64> [[Y]], [[Z]]
+; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i64> [[X]], <4 x i64> [[Z]]
+; CHECK-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i64> [[Y]], <4 x i64> [[X]]
+; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i64> [[SELECT_XZ]], <4 x i64> [[SELECT_YX]], <8 x i32> <i32 1, i32 7, i32 4, i32 2, i32 0, i32 6, i32 3, i32 5>
+; CHECK-NEXT: ret <8 x i64> [[RES]]
+;
+ %cmp.xy = icmp slt <4 x i64> %x, %y
+ %cmp.yz = icmp slt <4 x i64> %y, %z
+ %select.xz = select <4 x i1> %cmp.xy, <4 x i64> %x, <4 x i64> %z
+ %select.yx = select <4 x i1> %cmp.yz, <4 x i64> %y, <4 x i64> %x
+ %res = shufflevector <4 x i64> %select.xz, <4 x i64> %select.yx, <8 x i32> <i32 1, i32 7, i32 4, i32 2, i32 0, i32 6, i32 3, i32 5>
+ ret <8 x i64> %res
+}
+
+; selects with fast-math flags (FMF)
+define <8 x float> @src_v4tov8_float_nnan(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_nnan(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select nnan <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_nnan(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select nnan <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select nnan <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_nnan(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select nnan <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select nnan <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select nnan <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select nnan <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'ninf' selects over <4 x float> whose conditions are different fcmp olt results.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_float_ninf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_ninf(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select ninf <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_ninf(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select ninf <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select ninf <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_ninf(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select ninf <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select ninf <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select ninf <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select ninf <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'nnan ninf' selects over <4 x float> whose conditions are different fcmp olt results.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_float_nnan_ninf(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_nnan_ninf(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select nnan ninf <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_nnan_ninf(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select nnan ninf <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select nnan ninf <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_nnan_ninf(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select nnan ninf <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select nnan ninf <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select nnan ninf <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select nnan ninf <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'nsz' selects over <4 x float> whose conditions are different fcmp olt results.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_float_nsz(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_nsz(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select nsz <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_nsz(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_nsz(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select nsz <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select nsz <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'nnan nsz' selects over <4 x float> whose conditions are different fcmp olt results.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_float_nnan_nsz(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_nnan_nsz(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select nnan nsz <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_nnan_nsz(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select nnan nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select nnan nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_nnan_nsz(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select nnan nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select nnan nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select nnan nsz <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select nnan nsz <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'ninf nsz' selects over <4 x float> whose conditions are different fcmp olt results.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_float_ninf_nsz(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_float_ninf_nsz(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select ninf nsz <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_float_ninf_nsz(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select ninf nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select ninf nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_float_ninf_nsz(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select ninf nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select ninf nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select ninf nsz <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select ninf nsz <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
+
+; Concatenating shuffle of two 'nnan ninf nsz' selects over <4 x float> whose conditions are different fcmp olt results.
+; NOTE: despite the 'i16' in the function name, every value in this test is a float vector.
+; The SSE checks expect a single <8 x float> fcmp/select; the AVX2 and AVX512 checks expect the IR unchanged.
+define <8 x float> @src_v4tov8_i16_nnan_ninf_nsz(<4 x float> %x, <4 x float> %y, <4 x float> %z) {
+; SSE-LABEL: define <8 x float> @src_v4tov8_i16_nnan_ninf_nsz(
+; SSE-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x float> [[Y]], <4 x float> [[Z]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = fcmp olt <8 x float> [[TMP1]], [[TMP2]]
+; SSE-NEXT: [[TMP4:%.*]] = shufflevector <4 x float> [[X]], <4 x float> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP5:%.*]] = shufflevector <4 x float> [[Z]], <4 x float> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select nnan ninf nsz <8 x i1> [[TMP3]], <8 x float> [[TMP4]], <8 x float> [[TMP5]]
+; SSE-NEXT: ret <8 x float> [[RES]]
+;
+; AVX2-LABEL: define <8 x float> @src_v4tov8_i16_nnan_ninf_nsz(
+; AVX2-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX2-NEXT: [[SELECT_XZ:%.*]] = select nnan ninf nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX2-NEXT: [[SELECT_YX:%.*]] = select nnan ninf nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX2-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: ret <8 x float> [[RES]]
+;
+; AVX512-LABEL: define <8 x float> @src_v4tov8_i16_nnan_ninf_nsz(
+; AVX512-SAME: <4 x float> [[X:%.*]], <4 x float> [[Y:%.*]], <4 x float> [[Z:%.*]]) #[[ATTR0]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = fcmp olt <4 x float> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = fcmp olt <4 x float> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select nnan ninf nsz <4 x i1> [[CMP_XY]], <4 x float> [[X]], <4 x float> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select nnan ninf nsz <4 x i1> [[CMP_YZ]], <4 x float> [[Y]], <4 x float> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x float> [[SELECT_XZ]], <4 x float> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x float> [[RES]]
+;
+ %cmp.xy = fcmp olt <4 x float> %x, %y
+ %cmp.yz = fcmp olt <4 x float> %y, %z
+ %select.xz = select nnan ninf nsz <4 x i1> %cmp.xy, <4 x float> %x, <4 x float> %z
+ %select.yx = select nnan ninf nsz <4 x i1> %cmp.yz, <4 x float> %y, <4 x float> %x
+ %res = shufflevector <4 x float> %select.xz, <4 x float> %select.yx, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %res
+}
>From dbd388b2c9f5dfd7f689b98ca0aa62ea149808f8 Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Fri, 21 Feb 2025 02:13:24 +0900
Subject: [PATCH 2/3] [VectorCombine] Handle shuffle of selects
(shuffle (select c1,t1,f1), (select c2,t2,f2), m)
-> (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
A SelectInst on vectors operates element-wise:
`V'select[i] = Condition[i] ? V'True[i] : V'False[i]`.
If a ShuffleVector is performed on two selects, each result element is
one such selected element, picked by the mask:
`V'[mask] = (V'select[i] = Condition[i] ? V'True[i] : V'False[i])`
That is why a ShuffleVector of two SelectInsts is equivalent to first
applying the same ShuffleVector to the Condition/True/False operands
and then performing a single SelectInst on the shuffled results.
This patch implements the transformation described above.
Proof: https://alive2.llvm.org/ce/z/97wfHp
Fixed: https://github.com/llvm/llvm-project/issues/120775
---
.../Transforms/Vectorize/VectorCombine.cpp | 53 +++++++++++++++++++
.../AArch64/shuffletoidentity.ll | 25 ++++-----
.../VectorCombine/X86/shuffle-of-selects.ll | 38 +++++++++----
3 files changed, 95 insertions(+), 21 deletions(-)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 746742e14d080..5982dfa444a58 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -119,6 +119,7 @@ class VectorCombine {
bool foldConcatOfBoolMasks(Instruction &I);
bool foldPermuteOfBinops(Instruction &I);
bool foldShuffleOfBinops(Instruction &I);
+ bool foldShuffleOfSelects(Instruction &I);
bool foldShuffleOfCastops(Instruction &I);
bool foldShuffleOfShuffles(Instruction &I);
bool foldShuffleOfIntrinsics(Instruction &I);
@@ -1899,6 +1900,57 @@ bool VectorCombine::foldShuffleOfBinops(Instruction &I) {
return true;
}
+/// Try to convert
+///   (shuffle (select c1,t1,f1), (select c2,t2,f2), m)
+/// into
+///   (select (shuffle c1,c2,m), (shuffle t1,t2,m), (shuffle f1,f2,m))
+/// unless the target cost model rates the new sequence as more expensive.
+///
+/// Both selects must have a single use and a fixed-width vector condition.
+/// \returns true if \p I was replaced.
+bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
+  ArrayRef<int> Mask;
+  Value *C1, *T1, *F1, *C2, *T2, *F2;
+  if (!match(&I, m_Shuffle(
+                     m_OneUse(m_Select(m_Value(C1), m_Value(T1), m_Value(F1))),
+                     m_OneUse(m_Select(m_Value(C2), m_Value(T2), m_Value(F2))),
+                     m_Mask(Mask))))
+    return false;
+
+  // The conditions are shuffled as well, so a select with a scalar condition
+  // cannot be handled. Check every type before it is dereferenced below.
+  auto *C1VecTy = dyn_cast<FixedVectorType>(C1->getType());
+  auto *C2VecTy = dyn_cast<FixedVectorType>(C2->getType());
+  // Both shuffle operands share one type, so T1's type also describes T2.
+  auto *SrcVecTy = dyn_cast<FixedVectorType>(T1->getType());
+  auto *DstVecTy = dyn_cast<FixedVectorType>(I.getType());
+  if (!C1VecTy || !C2VecTy || !SrcVecTy || !DstVecTy)
+    return false;
+
+  const auto SK = TargetTransformInfo::SK_PermuteTwoSrc;
+  const auto SelectOp = Instruction::Select;
+
+  // Old sequence: two narrow selects feeding one shuffle. Shuffle costs are
+  // queried on the type of the shuffle's source operands.
+  InstructionCost OldCost = TTI.getCmpSelInstrCost(
+      SelectOp, SrcVecTy, C1VecTy, CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  OldCost += TTI.getCmpSelInstrCost(SelectOp, SrcVecTy, C2VecTy,
+                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
+  OldCost += TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr,
+                                {I.getOperand(0), I.getOperand(1)}, &I);
+
+  // New sequence: one shuffle each for the conditions, the true values and
+  // the false values, followed by one select on the shuffled width. The
+  // condition of that select is an i1 vector with the destination's length.
+  auto *ShuffledCondTy = cast<FixedVectorType>(
+      toVectorTy(Type::getInt1Ty(I.getContext()), DstVecTy->getNumElements()));
+  InstructionCost NewCost =
+      TTI.getShuffleCost(SK, C1VecTy, Mask, CostKind, 0, nullptr, {C1, C2});
+  NewCost +=
+      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {T1, T2});
+  NewCost +=
+      TTI.getShuffleCost(SK, SrcVecTy, Mask, CostKind, 0, nullptr, {F1, F2});
+  NewCost += TTI.getCmpSelInstrCost(SelectOp, DstVecTy, ShuffledCondTy,
+                                    CmpInst::BAD_ICMP_PREDICATE, CostKind);
+
+  // Only fold when the new sequence is not more expensive.
+  if (NewCost > OldCost)
+    return false;
+
+  // NOTE(review): fast-math flags of the original selects are not explicitly
+  // forwarded to the new select here; confirm whether they should be.
+  Value *ShuffleCmp = Builder.CreateShuffleVector(C1, C2, Mask);
+  Value *ShuffleTrue = Builder.CreateShuffleVector(T1, T2, Mask);
+  Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
+  Value *NewShuf = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
+
+  replaceValue(I, *NewShuf);
+  return true;
+}
+
/// Try to convert "shuffle (castop), (castop)" with a shared castop operand
/// into "castop (shuffle)".
bool VectorCombine::foldShuffleOfCastops(Instruction &I) {
@@ -3352,6 +3404,7 @@ bool VectorCombine::run() {
case Instruction::ShuffleVector:
MadeChange |= foldPermuteOfBinops(I);
MadeChange |= foldShuffleOfBinops(I);
+ MadeChange |= foldShuffleOfSelects(I);
MadeChange |= foldShuffleOfCastops(I);
MadeChange |= foldShuffleOfShuffles(I);
MadeChange |= foldShuffleOfIntrinsics(I);
diff --git a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
index 09875c5e0af40..e1ede6a3aab5d 100644
--- a/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
+++ b/llvm/test/Transforms/VectorCombine/AArch64/shuffletoidentity.ll
@@ -451,18 +451,18 @@ define <8 x i8> @icmpsel(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
define <8 x i8> @icmpsel_diffentcond(<8 x i8> %a, <8 x i8> %b, <8 x i8> %c, <8 x i8> %d) {
; CHECK-LABEL: @icmpsel_diffentcond(
-; CHECK-NEXT: [[AB:%.*]] = shufflevector <8 x i8> [[A:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[AT:%.*]] = shufflevector <8 x i8> [[A]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[BB:%.*]] = shufflevector <8 x i8> [[B:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
-; CHECK-NEXT: [[BT:%.*]] = shufflevector <8 x i8> [[B]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: [[CB:%.*]] = shufflevector <8 x i8> [[C:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[CT:%.*]] = shufflevector <8 x i8> [[C]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
; CHECK-NEXT: [[DB:%.*]] = shufflevector <8 x i8> [[D:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: [[DT:%.*]] = shufflevector <8 x i8> [[D]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
-; CHECK-NEXT: [[ABT1:%.*]] = icmp slt <4 x i8> [[AT]], [[BT]]
-; CHECK-NEXT: [[ABB1:%.*]] = icmp ult <4 x i8> [[AB]], [[BB]]
-; CHECK-NEXT: [[ABT:%.*]] = select <4 x i1> [[ABT1]], <4 x i8> [[CT]], <4 x i8> [[DT]]
-; CHECK-NEXT: [[ABB:%.*]] = select <4 x i1> [[ABB1]], <4 x i8> [[CB]], <4 x i8> [[DB]]
+; CHECK-NEXT: [[CB1:%.*]] = shufflevector <8 x i8> [[C1:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[CT1:%.*]] = shufflevector <8 x i8> [[C1]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: [[DB1:%.*]] = shufflevector <8 x i8> [[D1:%.*]], <8 x i8> poison, <4 x i32> <i32 3, i32 2, i32 1, i32 0>
+; CHECK-NEXT: [[DT1:%.*]] = shufflevector <8 x i8> [[D1]], <8 x i8> poison, <4 x i32> <i32 7, i32 6, i32 5, i32 4>
+; CHECK-NEXT: [[ABT1:%.*]] = icmp slt <4 x i8> [[CT]], [[DT]]
+; CHECK-NEXT: [[ABB1:%.*]] = icmp ult <4 x i8> [[CB]], [[DB]]
+; CHECK-NEXT: [[ABT:%.*]] = select <4 x i1> [[ABT1]], <4 x i8> [[CT1]], <4 x i8> [[DT1]]
+; CHECK-NEXT: [[ABB:%.*]] = select <4 x i1> [[ABB1]], <4 x i8> [[CB1]], <4 x i8> [[DB1]]
; CHECK-NEXT: [[R:%.*]] = shufflevector <4 x i8> [[ABT]], <4 x i8> [[ABB]], <8 x i32> <i32 7, i32 6, i32 5, i32 4, i32 3, i32 2, i32 1, i32 0>
; CHECK-NEXT: ret <8 x i8> [[R]]
;
@@ -992,14 +992,15 @@ define void @maximal_legal_fpmath(ptr %addr1, ptr %addr2, ptr %result, float %va
}
; Peek through (repeated) bitcasts to find a common source value.
+; TODO: We can remove the shufflevector for A, B.
define <4 x i64> @bitcast_smax_v8i32_v4i32(<4 x i64> %a, <4 x i64> %b) {
; CHECK-LABEL: @bitcast_smax_v8i32_v4i32(
; CHECK-NEXT: [[A_BC0:%.*]] = bitcast <4 x i64> [[A:%.*]] to <8 x i32>
; CHECK-NEXT: [[B_BC0:%.*]] = bitcast <4 x i64> [[B:%.*]] to <8 x i32>
-; CHECK-NEXT: [[CMP:%.*]] = icmp slt <8 x i32> [[A_BC0]], [[B_BC0]]
-; CHECK-NEXT: [[A_BC1:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
-; CHECK-NEXT: [[B_BC1:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32>
-; CHECK-NEXT: [[CONCAT:%.*]] = select <8 x i1> [[CMP]], <8 x i32> [[B_BC1]], <8 x i32> [[A_BC1]]
+; CHECK-NEXT: [[TMP1:%.*]] = icmp slt <8 x i32> [[A_BC0]], [[B_BC0]]
+; CHECK-NEXT: [[TMP5:%.*]] = bitcast <4 x i64> [[A]] to <8 x i32>
+; CHECK-NEXT: [[TMP3:%.*]] = bitcast <4 x i64> [[B]] to <8 x i32>
+; CHECK-NEXT: [[CONCAT:%.*]] = select <8 x i1> [[TMP1]], <8 x i32> [[TMP3]], <8 x i32> [[TMP5]]
; CHECK-NEXT: [[RES:%.*]] = bitcast <8 x i32> [[CONCAT]] to <4 x i64>
; CHECK-NEXT: ret <4 x i64> [[RES]]
;
diff --git a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
index 91a650f751902..cc51734e5a6c3 100644
--- a/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
+++ b/llvm/test/Transforms/VectorCombine/X86/shuffle-of-selects.ll
@@ -4,14 +4,34 @@
; RUN: opt < %s -passes=vector-combine -S -mtriple=x86_64-- -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=CHECK,AVX512
define <8 x i16> @src_v4tov8_i16(<4 x i16> %x, <4 x i16> %y, <4 x i16> %z) {
-; CHECK-LABEL: define <8 x i16> @src_v4tov8_i16(
-; CHECK-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
-; CHECK-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
-; CHECK-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
-; CHECK-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i16> [[X]], <4 x i16> [[Z]]
-; CHECK-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i16> [[Y]], <4 x i16> [[X]]
-; CHECK-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[SELECT_XZ]], <4 x i16> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
-; CHECK-NEXT: ret <8 x i16> [[RES]]
+; SSE-LABEL: define <8 x i16> @src_v4tov8_i16(
+; SSE-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
+; SSE-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
+; SSE-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
+; SSE-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP_XY]], <4 x i1> [[CMP_YZ]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[Z]], <4 x i16> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; SSE-NEXT: [[RES:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]
+; SSE-NEXT: ret <8 x i16> [[RES]]
+;
+; AVX2-LABEL: define <8 x i16> @src_v4tov8_i16(
+; AVX2-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX2-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
+; AVX2-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
+; AVX2-NEXT: [[TMP1:%.*]] = shufflevector <4 x i1> [[CMP_XY]], <4 x i1> [[CMP_YZ]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[TMP2:%.*]] = shufflevector <4 x i16> [[X]], <4 x i16> [[Y]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[TMP3:%.*]] = shufflevector <4 x i16> [[Z]], <4 x i16> [[X]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX2-NEXT: [[RES:%.*]] = select <8 x i1> [[TMP1]], <8 x i16> [[TMP2]], <8 x i16> [[TMP3]]
+; AVX2-NEXT: ret <8 x i16> [[RES]]
+;
+; AVX512-LABEL: define <8 x i16> @src_v4tov8_i16(
+; AVX512-SAME: <4 x i16> [[X:%.*]], <4 x i16> [[Y:%.*]], <4 x i16> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
+; AVX512-NEXT: [[CMP_XY:%.*]] = icmp slt <4 x i16> [[X]], [[Y]]
+; AVX512-NEXT: [[CMP_YZ:%.*]] = icmp slt <4 x i16> [[Y]], [[Z]]
+; AVX512-NEXT: [[SELECT_XZ:%.*]] = select <4 x i1> [[CMP_XY]], <4 x i16> [[X]], <4 x i16> [[Z]]
+; AVX512-NEXT: [[SELECT_YX:%.*]] = select <4 x i1> [[CMP_YZ]], <4 x i16> [[Y]], <4 x i16> [[X]]
+; AVX512-NEXT: [[RES:%.*]] = shufflevector <4 x i16> [[SELECT_XZ]], <4 x i16> [[SELECT_YX]], <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+; AVX512-NEXT: ret <8 x i16> [[RES]]
;
%cmp.xy = icmp slt <4 x i16> %x, %y
%cmp.yz = icmp slt <4 x i16> %y, %z
@@ -173,7 +193,7 @@ define <16 x i32> @src_v8tov16_i32(<8 x i32> %x, <8 x i32> %y, <8 x i32> %z) {
define <32 x i32> @src_v16tov32_i32(<16 x i32> %x, <16 x i32> %y, <16 x i32> %z) {
; CHECK-LABEL: define <32 x i32> @src_v16tov32_i32(
-; CHECK-SAME: <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> [[Z:%.*]]) #[[ATTR0]] {
+; CHECK-SAME: <16 x i32> [[X:%.*]], <16 x i32> [[Y:%.*]], <16 x i32> [[Z:%.*]]) #[[ATTR0:[0-9]+]] {
; CHECK-NEXT: [[TMP1:%.*]] = shufflevector <16 x i32> [[X]], <16 x i32> [[Y]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP2:%.*]] = shufflevector <16 x i32> [[Y]], <16 x i32> [[Z]], <32 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23, i32 24, i32 25, i32 26, i32 27, i32 28, i32 29, i32 30, i32 31>
; CHECK-NEXT: [[TMP3:%.*]] = icmp slt <32 x i32> [[TMP1]], [[TMP2]]
>From b519b561bf037512d1722295992f4015d26d753a Mon Sep 17 00:00:00 2001
From: hanbeom <kese111 at gmail.com>
Date: Sat, 22 Feb 2025 19:05:06 +0900
Subject: [PATCH 3/3] add final result to worklist
---
llvm/lib/Transforms/Vectorize/VectorCombine.cpp | 1 +
1 file changed, 1 insertion(+)
diff --git a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
index 5982dfa444a58..18f3478fee20a 100644
--- a/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
+++ b/llvm/lib/Transforms/Vectorize/VectorCombine.cpp
@@ -1947,6 +1947,7 @@ bool VectorCombine::foldShuffleOfSelects(Instruction &I) {
Value *ShuffleFalse = Builder.CreateShuffleVector(F1, F2, Mask);
Value *NewShuf = Builder.CreateSelect(ShuffleCmp, ShuffleTrue, ShuffleFalse);
+ Worklist.pushValue(NewShuf);
replaceValue(I, *NewShuf);
return true;
}
More information about the llvm-commits
mailing list