[llvm] goldsteinn/simplify x86 cmp (PR #84360)
via llvm-commits
llvm-commits at lists.llvm.org
Thu Mar 7 11:01:05 PST 2024
https://github.com/goldsteinn created https://github.com/llvm/llvm-project/pull/84360
- **[X86] Add tests for folding `icmp` of `v8i32` -> `fcmp` of `v8f32` on AVX; NFC**
- **[X86] Try Folding `icmp` of `v8i32` -> `fcmp` of `v8f32` on AVX**
- **[X86] Improve helper for simplifying demanded bits of compares**
From ef29fdf76be2f84c3afcaf17452d65dd50ee647a Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Mon, 19 Feb 2024 16:13:24 -0600
Subject: [PATCH 1/3] [X86] Add tests for folding `icmp` of `v8i32` -> `fcmp`
of `v8f32` on AVX; NFC
---
llvm/test/CodeGen/X86/cmpf-avx.ll | 263 ++++++++++++++++++++++++++++++
1 file changed, 263 insertions(+)
create mode 100644 llvm/test/CodeGen/X86/cmpf-avx.ll
diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll
new file mode 100644
index 00000000000000..d37ea66ae586c5
--- /dev/null
+++ b/llvm/test/CodeGen/X86/cmpf-avx.ll
@@ -0,0 +1,263 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=CHECK,X64
+
+define <8 x i32> @cmp_eq_bitcast(<8 x i32> %x) {
+; X86-LABEL: cmp_eq_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
+; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_bitcast:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
+; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %and = and <8 x i32> %x, <i32 7, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %cmp = icmp eq <8 x i32> %and, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ne_sitofp(<8 x i32> %x) {
+; CHECK-LABEL: cmp_ne_sitofp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp ne <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_slt_fail_no_const(<8 x i32> %x, <8 x i32> %y) {
+; X86-LABEL: cmp_slt_fail_no_const:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X86-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; X86-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_slt_fail_no_const:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm1, %xmm2
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm3
+; X64-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
+; X64-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; X64-NEXT: retq
+ %and = and <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %cmp = icmp slt <8 x i32> %and, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_sitofp(<8 x i32> %x) {
+; CHECK-LABEL: cmp_eq_sitofp:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [4294967293,4294967293,4294967293,4294967293]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp eq <8 x i32> %x, <i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sgt_fail_no_bounds(<8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: cmp_sgt_fail_no_bounds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm3
+; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp slt <8 x i32> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sgt_bitcast(<8 x i32> %xx, <8 x i32> %yy) {
+; CHECK-LABEL: cmp_sgt_bitcast:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vbroadcastss {{.*#+}} ymm2 = [2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040,2139095040]
+; CHECK-NEXT: vandps %ymm2, %ymm0, %ymm0
+; CHECK-NEXT: vandps %ymm2, %ymm1, %ymm1
+; CHECK-NEXT: vextractf128 $1, %ymm1, %xmm2
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm3
+; CHECK-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm2
+; CHECK-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %y = and <8 x i32> %yy, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+
+ %cmp = icmp sgt <8 x i32> %x, %y
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_sle_fail_out_of_bounds(<8 x i32> %xx) {
+; X86-LABEL: cmp_sle_fail_out_of_bounds:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041]
+; X86-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_sle_fail_out_of_bounds:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [2139095041,2139095041,2139095041,2139095041]
+; X64-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2139095041, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %cmp = icmp sle <8 x i32> %x, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_fail_out_of_bounds(<8 x i32> %x) {
+; CHECK-LABEL: cmp_eq_fail_out_of_bounds:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [16777216,16777216,16777216,16777216]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp eq <8 x i32> %x, <i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216, i32 16777216>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_fail_out_of_bounds2(<8 x i32> %x) {
+; CHECK-LABEL: cmp_eq_fail_out_of_bounds2:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [4278190080,4278190080,4278190080,4278190080]
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp eq <8 x i32> %x, <i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216, i32 -16777216>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_eq_todo(<8 x i32> %x) {
+; X86-LABEL: cmp_eq_todo:
+; X86: # %bb.0:
+; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X86-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_todo:
+; X64: # %bb.0:
+; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm0
+; X64-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; X64-NEXT: retq
+ %cmp = icmp eq <8 x i32> %x, <i32 -16777215, i32 16777215, i32 16777215, i32 -16777215, i32 16777215, i32 -16777215, i32 16777215, i32 -16777215>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ult_fail_maybe_negative(<8 x i32> %x) {
+; CHECK-LABEL: cmp_ult_fail_maybe_negative:
+; CHECK: # %bb.0:
+; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
+; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [2,2,2,2]
+; CHECK-NEXT: vpminud %xmm2, %xmm1, %xmm3
+; CHECK-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; CHECK-NEXT: vpminud %xmm2, %xmm0, %xmm2
+; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
+; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; CHECK-NEXT: ret{{[l|q]}}
+ %cmp = icmp ult <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ule_bitcast(<8 x i32> %xx) {
+; X86-LABEL: cmp_ule_bitcast:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; X86-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X86-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ule_bitcast:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
+; X64-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
+; X64-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
+ %cmp = icmp ule <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
+
+define <8 x i32> @cmp_ugt_sitofp(<8 x i32> %xx) {
+; X86-LABEL: cmp_ugt_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
+; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
+; X86-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ugt_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
+; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
+; X64-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
+; X64-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: retq
+ %x = and <8 x i32> %xx, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %cmp = icmp ugt <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
+ %sext = sext <8 x i1> %cmp to <8 x i32>
+ ret <8 x i32> %sext
+}
From 318cea6041702f71c45d103daa5d930157390ac1 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Thu, 7 Mar 2024 11:39:33 -0600
Subject: [PATCH 2/3] [X86] Try Folding `icmp` of `v8i32` -> `fcmp` of `v8f32`
on AVX
Fixes: #82242
The idea is that AVX (without AVX2) doesn't support integer comparisons on
`v8i32`, so the comparison gets split into 2x `v4i32` comparisons plus a
reconstruction of the `v8i32` result.
By converting to float, we can handle the comparison with 1-2 instructions
(1 if we can `bitcast`, 2 if we need to convert with `sitofp`).
Proofs: https://alive2.llvm.org/ce/z/AJDdQ8
(The proofs time out online, but they can be reproduced locally.)
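To illustrate the `sitofp` path (a rough sketch for illustration only, not code
taken from this patch; the function names are made up), the lowering effectively
treats

    define <8 x i1> @icmp_v8i32(<8 x i32> %x) {
      ; Sketch assumes the values involved stay within +/- 2^24, where the
      ; int -> float conversion below is exact.
      %cmp = icmp eq <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
      ret <8 x i1> %cmp
    }

as if it were

    define <8 x i1> @fcmp_v8f32(<8 x i32> %x) {
      %xf  = sitofp <8 x i32> %x to <8 x float>
      %cmp = fcmp oeq <8 x float> %xf, <float 3.0, float 3.0, float 3.0, float 3.0,
                                        float 3.0, float 3.0, float 3.0, float 3.0>
      ret <8 x i1> %cmp
    }

which lowers to one 256-bit `vcvtdq2ps` plus one `vcmpeqps` instead of the split
`vpcmpeqd`/`vinsertf128` sequence. The `bitcast` path skips the conversion
entirely; roughly speaking, it is only used when both operands are known to be
non-NaN, non-denormal float bit patterns.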
---
.../CodeGen/SelectionDAG/TargetLowering.cpp | 28 +
llvm/lib/Target/X86/X86ISelLowering.cpp | 130 +
.../X86/bitcast-int-to-vector-bool-sext.ll | 27 +-
.../X86/bitcast-int-to-vector-bool-zext.ll | 27 +-
llvm/test/CodeGen/X86/cmpf-avx.ll | 85 +-
.../CodeGen/X86/combine-sse41-intrinsics.ll | 3 +-
llvm/test/CodeGen/X86/combine-testps.ll | 25 +-
llvm/test/CodeGen/X86/masked_compressstore.ll | 34 +-
llvm/test/CodeGen/X86/masked_expandload.ll | 48 +-
llvm/test/CodeGen/X86/masked_gather.ll | 134 +-
llvm/test/CodeGen/X86/masked_load.ll | 8 +-
llvm/test/CodeGen/X86/masked_store.ll | 69 +-
llvm/test/CodeGen/X86/masked_store_trunc.ll | 77 +-
.../CodeGen/X86/masked_store_trunc_ssat.ll | 77 +-
.../CodeGen/X86/masked_store_trunc_usat.ll | 77 +-
llvm/test/CodeGen/X86/nontemporal-loads.ll | 18 +-
llvm/test/CodeGen/X86/pr48215.ll | 19 +-
llvm/test/CodeGen/X86/sadd_sat_vec.ll | 26 +-
llvm/test/CodeGen/X86/setcc-lowering.ll | 7 +-
llvm/test/CodeGen/X86/ssub_sat_vec.ll | 67 +-
llvm/test/CodeGen/X86/v8i1-masks.ll | 16 +-
llvm/test/CodeGen/X86/vec_saddo.ll | 90 +-
llvm/test/CodeGen/X86/vec_ssubo.ll | 90 +-
llvm/test/CodeGen/X86/vec_umulo.ll | 71 +-
.../X86/vector-constrained-fp-intrinsics.ll | 18 +-
llvm/test/CodeGen/X86/vector-pcmp.ll | 22 +-
.../CodeGen/X86/vector-popcnt-256-ult-ugt.ll | 3135 ++++++++---------
.../CodeGen/X86/vector-reduce-fmaximum.ll | 58 -
.../test/CodeGen/X86/vector-reduce-or-bool.ll | 24 +-
.../CodeGen/X86/vector-reduce-xor-bool.ll | 22 +-
llvm/test/CodeGen/X86/vector-sext.ll | 9 +-
llvm/test/CodeGen/X86/vector-unsigned-cmp.ll | 17 +-
32 files changed, 2260 insertions(+), 2298 deletions(-)
diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
index a639cba5e35a80..2e1443b97d7a61 100644
--- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp
@@ -816,6 +816,18 @@ SDValue TargetLowering::SimplifyMultipleUseDemandedBits(
}
break;
}
+ case ISD::SINT_TO_FP: {
+ EVT InnerVT = Op.getOperand(0).getValueType();
+ if (DemandedBits.isSignMask() &&
+ VT.getScalarSizeInBits() == InnerVT.getScalarSizeInBits())
+ return DAG.getBitcast(VT, Op.getOperand(0));
+ break;
+ }
+ case ISD::UINT_TO_FP: {
+ if (DemandedBits.isSignMask())
+ return DAG.getConstant(0, SDLoc(Op), VT);
+ break;
+ }
case ISD::SIGN_EXTEND_INREG: {
// If none of the extended bits are demanded, eliminate the sextinreg.
SDValue Op0 = Op.getOperand(0);
@@ -2313,6 +2325,22 @@ bool TargetLowering::SimplifyDemandedBits(
Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
break;
}
+ case ISD::SINT_TO_FP: {
+ EVT InnerVT = Op.getOperand(0).getValueType();
+ if (DemandedBits.isSignMask() &&
+ VT.getScalarSizeInBits() == InnerVT.getScalarSizeInBits())
+ return TLO.CombineTo(Op, TLO.DAG.getBitcast(VT, Op.getOperand(0)));
+
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ break;
+ }
+ case ISD::UINT_TO_FP: {
+ if (DemandedBits.isSignMask())
+ return TLO.CombineTo(Op, TLO.DAG.getConstant(0, dl, VT));
+
+ Known = TLO.DAG.computeKnownBits(Op, DemandedElts, Depth);
+ break;
+ }
case ISD::SIGN_EXTEND_INREG: {
SDValue Op0 = Op.getOperand(0);
EVT ExVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 94c4bbc4a09993..bb65a42ffd7b69 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23352,6 +23352,136 @@ static SDValue LowerVSETCC(SDValue Op, const X86Subtarget &Subtarget,
}
}
+  // We get bad codegen for v8i32 compares on AVX targets (without AVX2), so
+  // if possible, convert to a v8f32 compare.
+ if (VTOp0 == MVT::v8i32 && Subtarget.hasAVX() && !Subtarget.hasAVX2()) {
+ std::optional<KnownBits> KnownOps[2];
+ // Check if an op is known to be in a certain range.
+ auto OpInRange = [&DAG, Op, &KnownOps](unsigned OpNo, bool CmpLT,
+ const APInt Bound) {
+ if (!KnownOps[OpNo].has_value())
+ KnownOps[OpNo] = DAG.computeKnownBits(Op.getOperand(OpNo));
+
+ if (KnownOps[OpNo]->isUnknown())
+ return false;
+
+ std::optional<bool> Res;
+ if (CmpLT)
+ Res = KnownBits::ult(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+ else
+ Res = KnownBits::ugt(*KnownOps[OpNo], KnownBits::makeConstant(Bound));
+ return Res.has_value() && *Res;
+ };
+
+ bool OkayCvt = false;
+ bool OkayBitcast = false;
+
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(MVT::f32);
+
+ // For cvt up to 1 << (Significand Precision), (1 << 24 for ieee float)
+ const APInt MaxConvertableCvt =
+ APInt::getOneBitSet(32, APFloat::semanticsPrecision(Sem));
+ // For bitcast up to (and including) first inf representation (0x7f800000 +
+ // 1 for ieee float)
+ const APInt MaxConvertableBitcast =
+ APFloat::getInf(Sem).bitcastToAPInt() + 1;
+  // For bitcast we also exclude denormal values. This check is necessary for
+  // correctness: DAZ (denormals-are-zero) will break the transform if we
+  // don't have it.
+ const APInt MinConvertableBitcast =
+ APFloat::getSmallestNormalized(Sem).bitcastToAPInt() - 1;
+
+ assert(
+ MaxConvertableBitcast.getBitWidth() == 32 &&
+ MaxConvertableCvt == (1U << 24) &&
+ MaxConvertableBitcast == 0x7f800001 &&
+ MinConvertableBitcast.isNonNegative() &&
+ MaxConvertableBitcast.sgt(MinConvertableBitcast) &&
+      "This transform has only been verified for IEEE Single Precision Float");
+
+  // For bitcast we need both lhs and rhs u< MaxConvertableBitcast.
+  // NB: It might be worth it to enable the bitcast version for unsigned avx2
+ // comparisons as they typically require multiple instructions to lower
+ // (they don't fit `vpcmpeq`/`vpcmpgt` well).
+ if (OpInRange(1, /*CmpLT*/ true, MaxConvertableBitcast) &&
+ OpInRange(1, /*CmpLT*/ false, MinConvertableBitcast) &&
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableBitcast) &&
+ OpInRange(0, /*CmpLT*/ false, MinConvertableBitcast)) {
+ OkayBitcast = true;
+ }
+ // We want to convert icmp -> fcmp using `sitofp` iff one of the converts
+ // will be constant folded.
+ else if ((DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op1)) ||
+ DAG.isConstantValueOfAnyType(peekThroughBitcasts(Op0)))) {
+ if (isUnsignedIntSetCC(Cond)) {
+ // For cvt + unsigned compare we need both lhs/rhs >= 0 and either lhs
+ // or rhs < MaxConvertableCvt
+
+ if (OpInRange(1, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+ OpInRange(0, /*CmpLT*/ true, APInt::getSignedMinValue(32)) &&
+ (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt)))
+ OkayCvt = true;
+ } else {
+ // For cvt + signed compare we need abs(lhs) or abs(rhs) <
+ // MaxConvertableCvt
+ if (OpInRange(1, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(1, /*CmpLT*/ false, -MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ true, MaxConvertableCvt) ||
+ OpInRange(0, /*CmpLT*/ false, -MaxConvertableCvt))
+ OkayCvt = true;
+ }
+ }
+ // TODO: If we can't prove any of the ranges, we could unconditionally lower
+ // `(icmp eq lhs, rhs)` as `(icmp eq (int_to_fp (xor lhs, rhs)), zero)`
+ if (OkayBitcast || OkayCvt) {
+ switch (Cond) {
+ default:
+ llvm_unreachable("Unexpected SETCC condition");
+      // Get the new FP condition. Note for the unsigned conditions we have
+      // verified it's okay to convert to the signed version.
+ case ISD::SETULT:
+ case ISD::SETLT:
+ Cond = ISD::SETOLT;
+ break;
+ case ISD::SETUGT:
+ case ISD::SETGT:
+ Cond = ISD::SETOGT;
+ break;
+ case ISD::SETULE:
+ case ISD::SETLE:
+ Cond = ISD::SETOLE;
+ break;
+ case ISD::SETUGE:
+ case ISD::SETGE:
+ Cond = ISD::SETOGE;
+ break;
+ case ISD::SETEQ:
+ Cond = ISD::SETOEQ;
+ break;
+ case ISD::SETNE:
+ Cond = ISD::SETONE;
+ break;
+ }
+
+ MVT FpVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount());
+ SDNodeFlags Flags;
+ Flags.setNoNaNs(true);
+ Flags.setNoInfs(true);
+ Flags.setNoSignedZeros(true);
+ if (OkayBitcast) {
+ Op0 = DAG.getBitcast(FpVT, Op0);
+ Op1 = DAG.getBitcast(FpVT, Op1);
+ } else {
+ Op0 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op0);
+ Op1 = DAG.getNode(ISD::SINT_TO_FP, dl, FpVT, Op1);
+ }
+ Op0->setFlags(Flags);
+ Op1->setFlags(Flags);
+ return DAG.getSetCC(dl, VT, Op0, Op1, Cond);
+ }
+ }
+
// Break 256-bit integer vector compare into smaller ones.
if (VT.is256BitVector() && !Subtarget.hasInt256())
return splitIntVSETCC(VT, Op0, Op1, Cond, DAG, dl);
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
index 6255621d870e12..eef2b3db5d694e 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-sext.ll
@@ -256,12 +256,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i8_8i32:
@@ -487,18 +484,12 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm2 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: ext_i16_16i32:
diff --git a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
index d2794df731b65d..5c810797bd2b75 100644
--- a/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
+++ b/llvm/test/CodeGen/X86/bitcast-int-to-vector-bool-zext.ll
@@ -320,12 +320,9 @@ define <8 x i32> @ext_i8_8i32(i8 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -613,20 +610,14 @@ define <16 x i32> @ext_i16_16i32(i16 %a0) {
; AVX1-NEXT: vmovd %edi, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm1
-; AVX1-NEXT: vmovaps {{.*#+}} ymm0 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm2
-; AVX1-NEXT: vpcmpeqd %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm2
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [1,1,1,1,1,1,1,1]
; AVX1-NEXT: vandps %ymm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm3 = [256,512,1024,2048,4096,8192,16384,32768]
-; AVX1-NEXT: vandps %ymm3, %ymm1, %ymm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm1
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm3, %ymm1
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; AVX1-NEXT: vandps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/cmpf-avx.ll b/llvm/test/CodeGen/X86/cmpf-avx.ll
index d37ea66ae586c5..15b909350b2675 100644
--- a/llvm/test/CodeGen/X86/cmpf-avx.ll
+++ b/llvm/test/CodeGen/X86/cmpf-avx.ll
@@ -6,21 +6,15 @@ define <8 x i32> @cmp_eq_bitcast(<8 x i32> %x) {
; X86-LABEL: cmp_eq_bitcast:
; X86: # %bb.0:
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
-; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp_eq_bitcast:
; X64: # %bb.0:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
-; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%and = and <8 x i32> %x, <i32 7, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%cmp = icmp eq <8 x i32> %and, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -29,17 +23,17 @@ define <8 x i32> @cmp_eq_bitcast(<8 x i32> %x) {
}
define <8 x i32> @cmp_ne_sitofp(<8 x i32> %x) {
-; CHECK-LABEL: cmp_ne_sitofp:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpxor %xmm3, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vpxor %xmm3, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: cmp_ne_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_ne_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpneq_oqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%cmp = icmp ne <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
%sext = sext <8 x i1> %cmp to <8 x i32>
ret <8 x i32> %sext
@@ -72,14 +66,17 @@ define <8 x i32> @cmp_slt_fail_no_const(<8 x i32> %x, <8 x i32> %y) {
}
define <8 x i32> @cmp_eq_sitofp(<8 x i32> %x) {
-; CHECK-LABEL: cmp_eq_sitofp:
-; CHECK: # %bb.0:
-; CHECK-NEXT: vextractf128 $1, %ymm0, %xmm1
-; CHECK-NEXT: vbroadcastss {{.*#+}} xmm2 = [4294967293,4294967293,4294967293,4294967293]
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; CHECK-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; CHECK-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; CHECK-NEXT: ret{{[l|q]}}
+; X86-LABEL: cmp_eq_sitofp:
+; X86: # %bb.0:
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: retl
+;
+; X64-LABEL: cmp_eq_sitofp:
+; X64: # %bb.0:
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: retq
%cmp = icmp eq <8 x i32> %x, <i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3, i32 -3>
%sext = sext <8 x i1> %cmp to <8 x i32>
ret <8 x i32> %sext
@@ -214,21 +211,15 @@ define <8 x i32> @cmp_ule_bitcast(<8 x i32> %xx) {
; X86-LABEL: cmp_ule_bitcast:
; X86: # %bb.0:
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
-; X86-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; X86-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp_ule_bitcast:
; X64: # %bb.0:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [4,4,4,4]
-; X64-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; X64-NEXT: vpcmpgtd %xmm0, %xmm2, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
%x = and <8 x i32> %xx, <i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040, i32 2139095040>
%cmp = icmp ule <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
@@ -240,21 +231,17 @@ define <8 x i32> @cmp_ugt_sitofp(<8 x i32> %xx) {
; X86-LABEL: cmp_ugt_sitofp:
; X86: # %bb.0:
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
-; X86-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; X86-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; X86-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: cmp_ugt_sitofp:
; X64: # %bb.0:
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT: vbroadcastss {{.*#+}} xmm2 = [3,3,3,3]
-; X64-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; X64-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
%x = and <8 x i32> %xx, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
%cmp = icmp ugt <8 x i32> %x, <i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3, i32 3>
diff --git a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
index cbb5bd09c2399a..a332b3e8908003 100644
--- a/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -164,14 +164,13 @@ define <4 x float> @demandedbits_sitofp_blendvps(<4 x float> %a0, <4 x float> %a
; SSE-LABEL: demandedbits_sitofp_blendvps:
; SSE: # %bb.0:
; SSE-NEXT: movaps %xmm0, %xmm3
-; SSE-NEXT: cvtdq2ps %xmm2, %xmm0
+; SSE-NEXT: movaps %xmm2, %xmm0
; SSE-NEXT: blendvps %xmm0, %xmm1, %xmm3
; SSE-NEXT: movaps %xmm3, %xmm0
; SSE-NEXT: retq
;
; AVX-LABEL: demandedbits_sitofp_blendvps:
; AVX: # %bb.0:
-; AVX-NEXT: vcvtdq2ps %xmm2, %xmm2
; AVX-NEXT: vblendvps %xmm2, %xmm1, %xmm0, %xmm0
; AVX-NEXT: retq
%cvt = sitofp <4 x i32> %a2 to <4 x float>
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
index 43dddbecf51a7d..66165ce2aa53a5 100644
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -171,13 +171,24 @@ define i32 @testpsz_128_signbit(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b)
}
define i32 @testpsnzc_256_signbit(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) {
-; CHECK-LABEL: testpsnzc_256_signbit:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: vtestps %ymm1, %ymm0
-; CHECK-NEXT: cmovnel %esi, %eax
-; CHECK-NEXT: vzeroupper
-; CHECK-NEXT: retq
+; AVX-LABEL: testpsnzc_256_signbit:
+; AVX: # %bb.0:
+; AVX-NEXT: movl %edi, %eax
+; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
+; AVX-NEXT: vtestps %ymm1, %ymm0
+; AVX-NEXT: cmovnel %esi, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
+;
+; AVX2-LABEL: testpsnzc_256_signbit:
+; AVX2: # %bb.0:
+; AVX2-NEXT: movl %edi, %eax
+; AVX2-NEXT: vtestps %ymm1, %ymm0
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vzeroupper
+; AVX2-NEXT: retq
%t0 = bitcast <8 x float> %c to <8 x i32>
%t1 = icmp sgt <8 x i32> zeroinitializer, %t0
%t2 = sext <8 x i1> %t1 to <8 x i32>
diff --git a/llvm/test/CodeGen/X86/masked_compressstore.ll b/llvm/test/CodeGen/X86/masked_compressstore.ll
index 3187bf6448690e..0ab572a50ed3de 100644
--- a/llvm/test/CodeGen/X86/masked_compressstore.ll
+++ b/llvm/test/CodeGen/X86/masked_compressstore.ll
@@ -1844,25 +1844,25 @@ define void @compressstore_v32f32_v32i32(ptr %base, <32 x float> %V, <32 x i32>
;
; AVX1-LABEL: compressstore_v32f32_v32i32:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5
+; AVX1-NEXT: vxorps %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9
+; AVX1-NEXT: vpackssdw %xmm9, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4
+; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9
+; AVX1-NEXT: vpackssdw %xmm9, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm6
-; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm7, %ymm4
+; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm5
+; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %eax
; AVX1-NEXT: shll $16, %eax
diff --git a/llvm/test/CodeGen/X86/masked_expandload.ll b/llvm/test/CodeGen/X86/masked_expandload.ll
index 4c5b67962a58bd..2a010e78b41edb 100644
--- a/llvm/test/CodeGen/X86/masked_expandload.ll
+++ b/llvm/test/CodeGen/X86/masked_expandload.ll
@@ -691,14 +691,14 @@ define <16 x double> @expandload_v16f64_v16i32(ptr %base, <16 x double> %src0, <
;
; AVX1-LABEL: expandload_v16f64_v16i32:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5
+; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vcmpeqps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm7
+; AVX1-NEXT: vpackssdw %xmm7, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4
+; AVX1-NEXT: vcmpeqps %ymm6, %ymm4, %ymm4
; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4
; AVX1-NEXT: vpackssdw %xmm6, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %eax
@@ -1989,25 +1989,25 @@ define <32 x float> @expandload_v32f32_v32i32(ptr %base, <32 x float> %src0, <32
;
; AVX1-LABEL: expandload_v32f32_v32i32:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm8
-; AVX1-NEXT: vpxor %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm8, %xmm5, %xmm5
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm8, %xmm8
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpackssdw %xmm8, %xmm4, %xmm4
+; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5
+; AVX1-NEXT: vxorps %xmm8, %xmm8, %xmm8
+; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm9
+; AVX1-NEXT: vpackssdw %xmm9, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4
+; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm9
+; AVX1-NEXT: vpackssdw %xmm9, %xmm4, %xmm4
; AVX1-NEXT: vpacksswb %xmm5, %xmm4, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %ecx
-; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm9, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm5
-; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm9, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm9, %xmm6
-; AVX1-NEXT: vpackssdw %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm7, %ymm4
+; AVX1-NEXT: vcmpeqps %ymm4, %ymm8, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vpackssdw %xmm5, %xmm4, %xmm4
+; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm5
+; AVX1-NEXT: vcmpeqps %ymm5, %ymm8, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
; AVX1-NEXT: vpacksswb %xmm4, %xmm5, %xmm4
; AVX1-NEXT: vpmovmskb %xmm4, %eax
; AVX1-NEXT: shll $16, %eax
diff --git a/llvm/test/CodeGen/X86/masked_gather.ll b/llvm/test/CodeGen/X86/masked_gather.ll
index 559a7ec0930b99..ff311ea67645d1 100644
--- a/llvm/test/CodeGen/X86/masked_gather.ll
+++ b/llvm/test/CodeGen/X86/masked_gather.ll
@@ -1328,14 +1328,12 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
;
; AVX1-LABEL: gather_v8i32_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
-; AVX1-NEXT: vmovmskps %ymm1, %eax
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm1
+; AVX1-NEXT: vxorps %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: vcmpeqps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: # implicit-def: $ymm1
+; AVX1-NEXT: # implicit-def: $ymm0
; AVX1-NEXT: jne .LBB4_1
; AVX1-NEXT: # %bb.2: # %else
; AVX1-NEXT: testb $2, %al
@@ -1359,16 +1357,14 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_16
; AVX1-NEXT: .LBB4_15: # %cond.load19
-; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm3[7]
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
; AVX1-NEXT: .LBB4_16: # %else20
-; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vmovmskps %ymm3, %eax
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm2
+; AVX1-NEXT: vmovmskps %ymm2, %eax
; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: # implicit-def: $ymm3
+; AVX1-NEXT: # implicit-def: $ymm2
; AVX1-NEXT: jne .LBB4_17
; AVX1-NEXT: # %bb.18: # %else26
; AVX1-NEXT: testb $2, %al
@@ -1392,16 +1388,14 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_32
; AVX1-NEXT: .LBB4_31: # %cond.load58
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5,6],ymm4[7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5,6],ymm3[7]
; AVX1-NEXT: .LBB4_32: # %else61
-; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
-; AVX1-NEXT: vmovmskps %ymm0, %eax
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpeqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vmovmskps %ymm1, %eax
; AVX1-NEXT: testb $1, %al
-; AVX1-NEXT: # implicit-def: $ymm0
+; AVX1-NEXT: # implicit-def: $ymm1
; AVX1-NEXT: jne .LBB4_33
; AVX1-NEXT: # %bb.34: # %else67
; AVX1-NEXT: testb $2, %al
@@ -1416,125 +1410,125 @@ define <8 x i32> @gather_v8i32_v8i32(<8 x i32> %trigger) {
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_42
; AVX1-NEXT: .LBB4_41: # %cond.load84
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7]
; AVX1-NEXT: .LBB4_42: # %else87
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_44
; AVX1-NEXT: # %bb.43: # %cond.load89
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm3[5],ymm0[6,7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm2[5],ymm1[6,7]
; AVX1-NEXT: .LBB4_44: # %else92
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm4, %xmm2
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_46
; AVX1-NEXT: # %bb.45: # %cond.load94
; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm3[6],ymm0[7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
; AVX1-NEXT: .LBB4_46: # %else97
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: je .LBB4_48
; AVX1-NEXT: # %bb.47: # %cond.load99
; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5,6],ymm2[7]
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5,6],ymm2[7]
; AVX1-NEXT: .LBB4_48: # %else102
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpaddd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: retq
; AVX1-NEXT: .LBB4_1: # %cond.load
-; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_4
; AVX1-NEXT: .LBB4_3: # %cond.load1
-; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vpinsrd $1, c+12(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB4_6
; AVX1-NEXT: .LBB4_5: # %cond.load4
-; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vpinsrd $2, c+12(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_8
; AVX1-NEXT: .LBB4_7: # %cond.load7
-; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm1, %xmm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
+; AVX1-NEXT: vpinsrd $3, c+12(%rip), %xmm0, %xmm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_10
; AVX1-NEXT: .LBB4_9: # %cond.load10
-; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3],ymm3[4],ymm1[5,6,7]
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3],ymm2[4],ymm0[5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_12
; AVX1-NEXT: .LBB4_11: # %cond.load13
-; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4],ymm3[5],ymm1[6,7]
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4],ymm2[5],ymm0[6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_14
; AVX1-NEXT: .LBB4_13: # %cond.load16
-; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm3
-; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm1[0,1,2,3,4,5],ymm3[6],ymm1[7]
+; AVX1-NEXT: vbroadcastss c+12(%rip), %ymm2
+; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm2[6],ymm0[7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne .LBB4_15
; AVX1-NEXT: jmp .LBB4_16
; AVX1-NEXT: .LBB4_17: # %cond.load23
-; AVX1-NEXT: vmovd {{.*#+}} xmm3 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm2 = mem[0],zero,zero,zero
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_20
; AVX1-NEXT: .LBB4_19: # %cond.load28
-; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm3, %xmm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm2, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB4_22
; AVX1-NEXT: .LBB4_21: # %cond.load33
-; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm3, %xmm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm2, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_24
; AVX1-NEXT: .LBB4_23: # %cond.load38
-; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm3, %xmm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm4[0,1,2,3],ymm3[4,5,6,7]
+; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm2, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm3[0,1,2,3],ymm2[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: je .LBB4_26
; AVX1-NEXT: .LBB4_25: # %cond.load43
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3],ymm4[4],ymm3[5,6,7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3],ymm3[4],ymm2[5,6,7]
; AVX1-NEXT: testb $32, %al
; AVX1-NEXT: je .LBB4_28
; AVX1-NEXT: .LBB4_27: # %cond.load48
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4],ymm4[5],ymm3[6,7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4],ymm3[5],ymm2[6,7]
; AVX1-NEXT: testb $64, %al
; AVX1-NEXT: je .LBB4_30
; AVX1-NEXT: .LBB4_29: # %cond.load53
-; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm4
-; AVX1-NEXT: vblendps {{.*#+}} ymm3 = ymm3[0,1,2,3,4,5],ymm4[6],ymm3[7]
+; AVX1-NEXT: vbroadcastss c+28(%rip), %ymm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm2 = ymm2[0,1,2,3,4,5],ymm3[6],ymm2[7]
; AVX1-NEXT: testb $-128, %al
; AVX1-NEXT: jne .LBB4_31
; AVX1-NEXT: jmp .LBB4_32
; AVX1-NEXT: .LBB4_33: # %cond.load64
-; AVX1-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
+; AVX1-NEXT: vmovd {{.*#+}} xmm1 = mem[0],zero,zero,zero
; AVX1-NEXT: testb $2, %al
; AVX1-NEXT: je .LBB4_36
; AVX1-NEXT: .LBB4_35: # %cond.load69
-; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrd $1, c+28(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $4, %al
; AVX1-NEXT: je .LBB4_38
; AVX1-NEXT: .LBB4_37: # %cond.load74
-; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrd $2, c+28(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $8, %al
; AVX1-NEXT: je .LBB4_40
; AVX1-NEXT: .LBB4_39: # %cond.load79
-; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vblendps {{.*#+}} ymm0 = ymm2[0,1,2,3],ymm0[4,5,6,7]
+; AVX1-NEXT: vpinsrd $3, c+28(%rip), %xmm1, %xmm3
+; AVX1-NEXT: vblendps {{.*#+}} ymm1 = ymm3[0,1,2,3],ymm1[4,5,6,7]
; AVX1-NEXT: testb $16, %al
; AVX1-NEXT: jne .LBB4_41
; AVX1-NEXT: jmp .LBB4_42
diff --git a/llvm/test/CodeGen/X86/masked_load.ll b/llvm/test/CodeGen/X86/masked_load.ll
index 89459a2d10177d..d99927ef850528 100644
--- a/llvm/test/CodeGen/X86/masked_load.ll
+++ b/llvm/test/CodeGen/X86/masked_load.ll
@@ -1442,11 +1442,9 @@ define <8 x float> @load_v8f32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x float>
;
; AVX1-LABEL: load_v8f32_v8i32:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps (%rdi), %ymm0, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/masked_store.ll b/llvm/test/CodeGen/X86/masked_store.ll
index 898b34e969b1d2..be27475b65b79a 100644
--- a/llvm/test/CodeGen/X86/masked_store.ll
+++ b/llvm/test/CodeGen/X86/masked_store.ll
@@ -1509,11 +1509,9 @@ define void @store_v8i32_v8i32(<8 x i32> %trigger, ptr %addr, <8 x i32> %val) {
;
; AVX1-LABEL: store_v8i32_v8i32:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, (%rdi)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -6127,37 +6125,42 @@ define void @store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts(ptr %trigge
;
; AVX1-LABEL: store_v24i32_v24i32_stride6_vf4_only_even_numbered_elts:
; AVX1: ## %bb.0:
-; AVX1-NEXT: vmovdqa (%rsi), %ymm0
-; AVX1-NEXT: vmovaps 32(%rsi), %ymm1
+; AVX1-NEXT: vmovaps (%rsi), %ymm1
+; AVX1-NEXT: vmovdqa 32(%rsi), %ymm0
; AVX1-NEXT: vmovaps 64(%rsi), %ymm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd 48(%rdi), %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtd 32(%rdi), %xmm3, %xmm5
-; AVX1-NEXT: vpackssdw %xmm4, %xmm5, %xmm4
-; AVX1-NEXT: vpacksswb %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtd 80(%rdi), %xmm3, %xmm5
-; AVX1-NEXT: vpcmpgtd 64(%rdi), %xmm3, %xmm6
-; AVX1-NEXT: vpcmpgtd 16(%rdi), %xmm3, %xmm7
-; AVX1-NEXT: vpcmpgtd (%rdi), %xmm3, %xmm8
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm8[0,1],xmm3[2,3],xmm8[4,5],xmm3[6,7]
-; AVX1-NEXT: vpslld $31, %xmm8, %xmm8
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm7[0,1],xmm3[2,3],xmm7[4,5],xmm3[6,7]
-; AVX1-NEXT: vpslld $31, %xmm7, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm8, %ymm7
-; AVX1-NEXT: vmaskmovps %ymm0, %ymm7, (%rdx)
-; AVX1-NEXT: vpblendw {{.*#+}} xmm0 = xmm6[0,1],xmm3[2,3],xmm6[4,5],xmm3[6,7]
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm5[0,1],xmm3[2,3],xmm5[4,5],xmm3[6,7]
+; AVX1-NEXT: vcvtdq2ps 32(%rdi), %ymm3
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcmpltps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm5
+; AVX1-NEXT: vpackssdw %xmm5, %xmm3, %xmm3
+; AVX1-NEXT: vcvtdq2ps (%rdi), %ymm5
+; AVX1-NEXT: vcmpltps %ymm4, %ymm5, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
+; AVX1-NEXT: vpackssdw %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpacksswb %xmm3, %xmm5, %xmm3
+; AVX1-NEXT: vcvtdq2ps 64(%rdi), %ymm5
+; AVX1-NEXT: vcmpltps %ymm4, %ymm5, %ymm4
+; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm5
+; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vblendps {{.*#+}} xmm4 = xmm4[0],xmm7[1],xmm4[2],xmm7[3]
+; AVX1-NEXT: vpslld $31, %xmm4, %xmm4
+; AVX1-NEXT: vblendps {{.*#+}} xmm5 = xmm5[0],xmm7[1],xmm5[2],xmm7[3]
; AVX1-NEXT: vpslld $31, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vmaskmovps %ymm2, %ymm0, 64(%rdx)
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
+; AVX1-NEXT: vmaskmovps %ymm2, %ymm4, 64(%rdx)
+; AVX1-NEXT: vpmovzxwq {{.*#+}} xmm2 = xmm3[0],zero,zero,zero,xmm3[1],zero,zero,zero
; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
-; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm0 = xmm0[4,4,5,5,6,6,7,7]
-; AVX1-NEXT: vpslld $31, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm2, %ymm0
-; AVX1-NEXT: vmaskmovps %ymm1, %ymm0, 32(%rdx)
+; AVX1-NEXT: vpblendw {{.*#+}} xmm4 = xmm6[0,1],xmm7[2,3],xmm6[4,5],xmm7[6,7]
+; AVX1-NEXT: vpslld $31, %xmm4, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vmaskmovps %ymm1, %ymm2, (%rdx)
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm3[4],xmm7[4],xmm3[5],xmm7[5],xmm3[6],xmm7[6],xmm3[7],xmm7[7]
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero
+; AVX1-NEXT: vpslld $31, %xmm2, %xmm2
+; AVX1-NEXT: vpunpckhwd {{.*#+}} xmm1 = xmm1[4,4,5,5,6,6,7,7]
+; AVX1-NEXT: vpslld $31, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vmaskmovps %ymm0, %ymm1, 32(%rdx)
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f4a0207dafde7c..7d1687e3733681 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -151,14 +151,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
@@ -372,13 +367,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %else
@@ -757,13 +749,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %else
@@ -2204,18 +2193,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB9_1
; AVX1-NEXT: # %bb.2: # %else
@@ -2867,18 +2855,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpackusdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB10_1
; AVX1-NEXT: # %bb.2: # %else
@@ -3421,13 +3408,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB11_1
; AVX1-NEXT: # %bb.2: # %else
@@ -3795,13 +3779,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpshufb %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpshufb %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm2[0],xmm0[1],xmm2[1]
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB12_1
; AVX1-NEXT: # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 487f7298f442c2..bc557f75f02a25 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -281,14 +281,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [2147483647,2147483647]
; AVX1-NEXT: # xmm4 = mem[0,0]
@@ -688,13 +683,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %else
@@ -1231,13 +1223,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %else
@@ -3103,18 +3092,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB9_1
; AVX1-NEXT: # %bb.2: # %else
@@ -3759,18 +3747,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
; AVX1-NEXT: vpackssdw %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB10_1
; AVX1-NEXT: # %bb.2: # %else
@@ -4311,13 +4298,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB11_1
; AVX1-NEXT: # %bb.2: # %else
@@ -4683,13 +4667,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB12_1
; AVX1-NEXT: # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 498f250f11c690..8215c52aabfbef 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -216,14 +216,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
; AVX1-NEXT: # xmm3 = mem[0,0]
; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm4
@@ -554,13 +549,10 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm5, %xmm0
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB1_1
; AVX1-NEXT: # %bb.2: # %else
@@ -1027,13 +1019,10 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB2_1
; AVX1-NEXT: # %bb.2: # %else
@@ -2741,18 +2730,17 @@ define void @truncstore_v16i32_v16i16(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB9_1
; AVX1-NEXT: # %bb.2: # %else
@@ -3435,18 +3423,17 @@ define void @truncstore_v16i32_v16i8(<16 x i32> %x, ptr %p, <16 x i32> %mask) {
; AVX1-NEXT: vpminud %xmm5, %xmm1, %xmm1
; AVX1-NEXT: vpackusdw %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm1
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm1
+; AVX1-NEXT: vxorps %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
+; AVX1-NEXT: vpackssdw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm3, %ymm2, %ymm2
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpacksswb %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpmovmskb %xmm1, %eax
-; AVX1-NEXT: xorl $65535, %eax # imm = 0xFFFF
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB10_1
; AVX1-NEXT: # %bb.2: # %else
@@ -4011,13 +3998,10 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpminud %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB11_1
; AVX1-NEXT: # %bb.2: # %else
@@ -4403,13 +4387,10 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vpminud %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackusdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackuswb %xmm0, %xmm0, %xmm0
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpneq_oqps %ymm2, %ymm1, %ymm1
; AVX1-NEXT: vmovmskps %ymm1, %eax
-; AVX1-NEXT: notl %eax
; AVX1-NEXT: testb $1, %al
; AVX1-NEXT: jne .LBB12_1
; AVX1-NEXT: # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/nontemporal-loads.ll b/llvm/test/CodeGen/X86/nontemporal-loads.ll
index 98d193a79cb747..8754ae5716ae73 100644
--- a/llvm/test/CodeGen/X86/nontemporal-loads.ll
+++ b/llvm/test/CodeGen/X86/nontemporal-loads.ll
@@ -1806,23 +1806,19 @@ define <16 x i32> @test_masked_v16i32(ptr %addr, <16 x i32> %old, <16 x i32> %ma
;
; AVX1-LABEL: test_masked_v16i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3
+; AVX1-NEXT: vxorps %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vcmpneq_oqps %ymm4, %ymm3, %ymm3
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vcmpneq_oqps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vmovntdqa 32(%rdi), %xmm4
; AVX1-NEXT: vmovntdqa 48(%rdi), %xmm5
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm4, %ymm4
-; AVX1-NEXT: vblendvps %ymm3, %ymm1, %ymm4, %ymm1
+; AVX1-NEXT: vblendvps %ymm3, %ymm4, %ymm1, %ymm1
; AVX1-NEXT: vmovntdqa (%rdi), %xmm3
; AVX1-NEXT: vmovntdqa 16(%rdi), %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vblendvps %ymm2, %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vblendvps %ymm2, %ymm3, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: test_masked_v16i32:
diff --git a/llvm/test/CodeGen/X86/pr48215.ll b/llvm/test/CodeGen/X86/pr48215.ll
index 8843a0410a9f79..7ccdcd60b0d100 100644
--- a/llvm/test/CodeGen/X86/pr48215.ll
+++ b/llvm/test/CodeGen/X86/pr48215.ll
@@ -10,18 +10,17 @@ define i32 @PR48215(i32 %a0, i32 %a1) {
; AVX1-NEXT: movl %edi, %eax
; AVX1-NEXT: cltd
; AVX1-NEXT: idivl %esi
-; AVX1-NEXT: vmovd %edx, %xmm0
+; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm1 = [4,5,6,7]
-; AVX1-NEXT: vmovd %eax, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm2 = xmm2[0,0,0,0]
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpmovsxbd {{.*#+}} xmm4 = [0,1,2,3]
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm2
-; AVX1-NEXT: vmovmskps %ymm2, %ecx
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovd %edx, %xmm1
+; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vmovmskps %ymm0, %ecx
+; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0
; AVX1-NEXT: vmovmskps %xmm0, %eax
+; AVX1-NEXT: xorl $15, %eax
; AVX1-NEXT: addl %ecx, %eax
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index b2b242fa29818f..bc01f0a8ac52c3 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -861,6 +861,9 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
@@ -1059,6 +1062,9 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcmpltps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
@@ -1067,19 +1073,21 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
-; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm6
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm8
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3
+; AVX1-NEXT: vcmpltps %ymm5, %ymm3, %ymm3
; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm7, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm7, %ymm1
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
diff --git a/llvm/test/CodeGen/X86/setcc-lowering.ll b/llvm/test/CodeGen/X86/setcc-lowering.ll
index 90e5c279d2e17c..30995bd8c523b1 100644
--- a/llvm/test/CodeGen/X86/setcc-lowering.ll
+++ b/llvm/test/CodeGen/X86/setcc-lowering.ll
@@ -10,10 +10,11 @@
define <8 x i16> @pr25080(<8 x i32> %a) {
; AVX1-LABEL: pr25080:
; AVX1: # %bb.0: # %entry
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpeqps %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index 64aead70415759..f3a19505c92dbc 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -898,23 +898,22 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-LABEL: v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm4, %ymm5
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm1
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm3, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm1, %ymm5, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i32:
@@ -1111,41 +1110,39 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-LABEL: v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm6, %ymm7
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6
-; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
+; AVX1-NEXT: vcmpltps %ymm2, %ymm5, %ymm2
+; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
+; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vbroadcastss {{.*#+}} ymm4 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm6, %ymm0
+; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpsubd %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm8
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6
-; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpgtd %xmm7, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vxorps %ymm1, %ymm5, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5
-; AVX1-NEXT: vpsrad $31, %xmm3, %xmm3
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3
+; AVX1-NEXT: vcmpltps %ymm3, %ymm5, %ymm3
+; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vpsrad $31, %xmm7, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm5, %ymm1
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index 67b7eb48e4cb3a..d16202a98bb8cf 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -180,21 +180,17 @@ define void @neg_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
define <8 x i32> @and_mask_constant(<8 x i32> %v0, <8 x i32> %v1) {
; X86-LABEL: and_mask_constant:
; X86: ## %bb.0:
-; X86-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X86-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X86-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; X86-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; X86-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X86-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X86-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X86-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
; X86-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: and_mask_constant:
; X64: ## %bb.0:
-; X64-NEXT: vextractf128 $1, %ymm0, %xmm1
-; X64-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; X64-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; X64-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; X64-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; X64-NEXT: vcvtdq2ps %ymm0, %ymm0
+; X64-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; X64-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
; X64-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; X64-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index cee30f5fe5da9e..233f2f4aec9f93 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -308,19 +308,18 @@ define <6 x i32> @saddo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX1-LABEL: saddo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v6i32:
@@ -376,19 +375,18 @@ define <8 x i32> @saddo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX1-LABEL: saddo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpaddd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpaddd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpaddd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: saddo_v8i32:
@@ -452,41 +450,43 @@ define <16 x i32> @saddo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-LABEL: saddo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpaddd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm5, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpaddd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm6
+; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcmpltps %ymm7, %ymm6, %ymm6
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8
+; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
-; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm5, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
-; AVX1-NEXT: vpaddd %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8
-; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vxorps %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpaddd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm8
+; AVX1-NEXT: vcmpltps %ymm7, %ymm8, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_ssubo.ll b/llvm/test/CodeGen/X86/vec_ssubo.ll
index 64ed081048851b..88e340d85735f3 100644
--- a/llvm/test/CodeGen/X86/vec_ssubo.ll
+++ b/llvm/test/CodeGen/X86/vec_ssubo.ll
@@ -311,19 +311,18 @@ define <6 x i32> @ssubo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX1-LABEL: ssubo_v6i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovq %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v6i32:
@@ -380,19 +379,18 @@ define <8 x i32> @ssubo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX1-LABEL: ssubo_v8i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
+; AVX1-NEXT: vpsubd %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
-; AVX1-NEXT: vpsubd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm4, %xmm4
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
-; AVX1-NEXT: vxorps %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vcmpltps %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vmovdqa %xmm2, 16(%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, (%rdi)
+; AVX1-NEXT: vmovdqa %xmm4, (%rdi)
; AVX1-NEXT: retq
;
; AVX2-LABEL: ssubo_v8i32:
@@ -457,41 +455,43 @@ define <16 x i32> @ssubo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-LABEL: ssubo_v16i32:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm4
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm4, %xmm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsubd %xmm4, %xmm7, %xmm4
-; AVX1-NEXT: vpcmpgtd %xmm4, %xmm7, %xmm7
-; AVX1-NEXT: vpxor %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm3, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpsubd %xmm4, %xmm5, %xmm4
+; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm6
+; AVX1-NEXT: vxorps %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vcmpltps %ymm6, %ymm7, %ymm6
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8
+; AVX1-NEXT: vpxor %xmm5, %xmm8, %xmm5
; AVX1-NEXT: vpsubd %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm7, %xmm1
-; AVX1-NEXT: vpackssdw %xmm6, %xmm1, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm7
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm8
-; AVX1-NEXT: vpsubd %xmm6, %xmm8, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm6, %xmm8, %xmm8
-; AVX1-NEXT: vpxor %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpcmpgtd %xmm5, %xmm2, %xmm5
+; AVX1-NEXT: vxorps %xmm1, %xmm6, %xmm1
+; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm6
+; AVX1-NEXT: vpsubd %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm5, %xmm6, %xmm6
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm8
+; AVX1-NEXT: vcmpltps %ymm8, %ymm7, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm8
+; AVX1-NEXT: vpxor %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vpsubd %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpackssdw %xmm7, %xmm0, %xmm0
+; AVX1-NEXT: vxorps %xmm0, %xmm7, %xmm0
+; AVX1-NEXT: vpackssdw %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpmovsxbd %xmm0, %xmm5
+; AVX1-NEXT: vpmovsxbd %xmm0, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm5, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm6, %ymm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm1, %xmm1
-; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
+; AVX1-NEXT: vpmovsxbd %xmm1, %xmm6
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm5, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm6, %ymm1
; AVX1-NEXT: vmovdqa %xmm4, 48(%rdi)
; AVX1-NEXT: vmovdqa %xmm3, 32(%rdi)
-; AVX1-NEXT: vmovdqa %xmm6, 16(%rdi)
+; AVX1-NEXT: vmovdqa %xmm5, 16(%rdi)
; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
; AVX1-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vec_umulo.ll b/llvm/test/CodeGen/X86/vec_umulo.ll
index 6311678924d06a..9e19a25522aedf 100644
--- a/llvm/test/CodeGen/X86/vec_umulo.ll
+++ b/llvm/test/CodeGen/X86/vec_umulo.ll
@@ -517,19 +517,16 @@ define <6 x i32> @umulo_v6i32(<6 x i32> %a0, <6 x i32> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcmpneq_oqps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT: vmovq %xmm1, 16(%rdi)
@@ -689,19 +686,16 @@ define <8 x i32> @umulo_v8i32(<8 x i32> %a0, <8 x i32> %a1, ptr %p2) nounwind {
; AVX1-NEXT: vpmuludq %xmm3, %xmm4, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3],xmm5[4,5],xmm2[6,7]
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm6, %xmm6, %xmm6
-; AVX1-NEXT: vpxor %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm0[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
-; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm8
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpxor %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpshufd {{.*#+}} xmm5 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm0[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpmuludq %xmm1, %xmm0, %xmm6
+; AVX1-NEXT: vpshufd {{.*#+}} xmm6 = xmm6[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm5 = xmm6[0,1],xmm5[2,3],xmm6[4,5],xmm5[6,7]
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm5, %ymm2
+; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
+; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vcmpneq_oqps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vpmulld %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmulld %xmm3, %xmm4, %xmm1
; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
@@ -937,16 +931,18 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpmuludq %xmm4, %xmm5, %xmm7
; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm7[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm6 = xmm7[0,1],xmm6[2,3],xmm7[4,5],xmm6[6,7]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm7 = xmm3[1,1,3,3]
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm1[1,1,3,3]
+; AVX1-NEXT: vpmuludq %xmm7, %xmm8, %xmm7
+; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm8
+; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm8[1,1,3,3]
+; AVX1-NEXT: vpblendw {{.*#+}} xmm7 = xmm8[0,1],xmm7[2,3],xmm8[4,5],xmm7[6,7]
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
+; AVX1-NEXT: vcvtdq2ps %ymm6, %ymm6
; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpshufd {{.*#+}} xmm8 = xmm3[1,1,3,3]
-; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm1[1,1,3,3]
-; AVX1-NEXT: vpmuludq %xmm8, %xmm9, %xmm8
-; AVX1-NEXT: vpmuludq %xmm3, %xmm1, %xmm9
-; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm9[1,1,3,3]
-; AVX1-NEXT: vpblendw {{.*#+}} xmm8 = xmm9[0,1],xmm8[2,3],xmm9[4,5],xmm8[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm8, %xmm8
-; AVX1-NEXT: vpackssdw %xmm6, %xmm8, %xmm6
+; AVX1-NEXT: vcmpneq_oqps %ymm7, %ymm6, %ymm6
+; AVX1-NEXT: vextractf128 $1, %ymm6, %xmm8
+; AVX1-NEXT: vpackssdw %xmm8, %xmm6, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm8
; AVX1-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[1,1,3,3]
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm10
@@ -955,18 +951,18 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpmuludq %xmm8, %xmm10, %xmm11
; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm11[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm9 = xmm11[0,1],xmm9[2,3],xmm11[4,5],xmm9[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm9, %xmm9
; AVX1-NEXT: vpshufd {{.*#+}} xmm11 = xmm2[1,1,3,3]
; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm0[1,1,3,3]
; AVX1-NEXT: vpmuludq %xmm11, %xmm12, %xmm11
; AVX1-NEXT: vpmuludq %xmm2, %xmm0, %xmm12
; AVX1-NEXT: vpshufd {{.*#+}} xmm12 = xmm12[1,1,3,3]
; AVX1-NEXT: vpblendw {{.*#+}} xmm11 = xmm12[0,1],xmm11[2,3],xmm12[4,5],xmm11[6,7]
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm11, %xmm7
+; AVX1-NEXT: vinsertf128 $1, %xmm9, %ymm11, %ymm9
+; AVX1-NEXT: vcvtdq2ps %ymm9, %ymm9
+; AVX1-NEXT: vcmpneq_oqps %ymm7, %ymm9, %ymm7
+; AVX1-NEXT: vextractf128 $1, %ymm7, %xmm9
; AVX1-NEXT: vpackssdw %xmm9, %xmm7, %xmm7
; AVX1-NEXT: vpacksswb %xmm6, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm9, %xmm9, %xmm9
-; AVX1-NEXT: vpxor %xmm7, %xmm9, %xmm7
; AVX1-NEXT: vpmulld %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpmulld %xmm8, %xmm10, %xmm8
; AVX1-NEXT: vpmulld %xmm3, %xmm1, %xmm3
@@ -976,7 +972,6 @@ define <16 x i32> @umulo_v16i32(<16 x i32> %a0, <16 x i32> %a1, ptr %p2) nounwin
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vpacksswb %xmm6, %xmm6, %xmm1
-; AVX1-NEXT: vpxor %xmm1, %xmm9, %xmm1
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm5
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,1,1]
; AVX1-NEXT: vpmovsxbd %xmm1, %xmm1
diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
index 0adb9ddfc426a8..b87a536e0fa25e 100644
--- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll
@@ -7745,19 +7745,15 @@ define <16 x float> @vpaddd_mask_test(<16 x float> %i, <16 x float> %j, <16 x i3
;
; AVX1-LABEL: vpaddd_mask_test:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm5, %xmm6
-; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm5, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
-; AVX1-NEXT: vextractf128 $1, %ymm4, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpeqd %xmm7, %xmm4, %xmm4
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm4, %ymm4
+; AVX1-NEXT: vcvtdq2ps %ymm5, %ymm5
+; AVX1-NEXT: vxorps %xmm6, %xmm6, %xmm6
+; AVX1-NEXT: vcmpneq_oqps %ymm6, %ymm5, %ymm5
+; AVX1-NEXT: vcvtdq2ps %ymm4, %ymm4
+; AVX1-NEXT: vcmpneq_oqps %ymm6, %ymm4, %ymm4
; AVX1-NEXT: vaddps %ymm3, %ymm1, %ymm3
; AVX1-NEXT: vaddps %ymm2, %ymm0, %ymm2
-; AVX1-NEXT: vblendvps %ymm4, %ymm0, %ymm2, %ymm0
-; AVX1-NEXT: vblendvps %ymm5, %ymm1, %ymm3, %ymm1
+; AVX1-NEXT: vblendvps %ymm4, %ymm2, %ymm0, %ymm0
+; AVX1-NEXT: vblendvps %ymm5, %ymm3, %ymm1, %ymm1
; AVX1-NEXT: retq
;
; AVX512-LABEL: vpaddd_mask_test:
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index 5b43acbe523757..fbd7df1c839e97 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1048,11 +1048,9 @@ define <8 x i32> @is_positive_mask_v8i32(<8 x i32> %x, <8 x i32> %y) {
;
; AVX1-LABEL: is_positive_mask_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vandps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
@@ -1310,11 +1308,9 @@ define <8 x i32> @is_positive_mask_load_v8i32(<8 x i32> %x, ptr %p) {
;
; AVX1-LABEL: is_positive_mask_load_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vandps (%rdi), %ymm0, %ymm0
; AVX1-NEXT: retq
;
@@ -1664,10 +1660,10 @@ define <8 x i1> @is_positive_mask_v8i32_v8i1(<8 x i32> %x, <8 x i1> %y) {
;
; AVX1-LABEL: is_positive_mask_v8i32_v8i1:
; AVX1: # %bb.0:
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm2 = [-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0,-1.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpand %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
index 05854ff728a077..fad3eb33400ea2 100644
--- a/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
+++ b/llvm/test/CodeGen/X86/vector-popcnt-256-ult-ugt.ll
@@ -3304,16 +3304,13 @@ define <8 x i32> @ugt_1_v8i32(<8 x i32> %0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpneq_oqps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_1_v8i32:
@@ -3372,14 +3369,13 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) {
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm3
-; AVX1-NEXT: vpand %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm2, %xmm0, %xmm2
-; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_2_v8i32:
@@ -3432,36 +3428,36 @@ define <8 x i32> @ult_2_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_2_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [2,2,2,2]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0,2.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_2_v8i32:
@@ -3535,36 +3531,35 @@ define <8 x i32> @ugt_2_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_3_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_3_v8i32:
@@ -3638,36 +3633,36 @@ define <8 x i32> @ult_3_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_3_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [3,3,3,3]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0,3.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_3_v8i32:
@@ -3741,36 +3736,35 @@ define <8 x i32> @ugt_3_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_4_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_4_v8i32:
@@ -3844,36 +3838,36 @@ define <8 x i32> @ult_4_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_4_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [4,4,4,4]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0,4.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_4_v8i32:
@@ -3947,36 +3941,35 @@ define <8 x i32> @ugt_4_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_5_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_5_v8i32:
@@ -4050,36 +4043,36 @@ define <8 x i32> @ult_5_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_5_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [5,5,5,5]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0,5.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_5_v8i32:
@@ -4153,36 +4146,35 @@ define <8 x i32> @ugt_5_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_6_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_6_v8i32:
@@ -4256,36 +4248,36 @@ define <8 x i32> @ult_6_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_6_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [6,6,6,6]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0,6.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_6_v8i32:
@@ -4359,36 +4351,35 @@ define <8 x i32> @ugt_6_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_7_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_7_v8i32:
@@ -4462,36 +4453,36 @@ define <8 x i32> @ult_7_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_7_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [7,7,7,7]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0,7.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_7_v8i32:
@@ -4565,36 +4556,35 @@ define <8 x i32> @ugt_7_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_8_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
+; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_8_v8i32:
@@ -4668,36 +4658,36 @@ define <8 x i32> @ult_8_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_8_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [8,8,8,8]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0,8.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_8_v8i32:
@@ -4771,36 +4761,35 @@ define <8 x i32> @ugt_8_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_9_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_9_v8i32:
@@ -4874,36 +4863,36 @@ define <8 x i32> @ult_9_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_9_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [9,9,9,9]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0,9.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_9_v8i32:
@@ -4977,36 +4966,35 @@ define <8 x i32> @ugt_9_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_10_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_10_v8i32:
@@ -5080,36 +5068,36 @@ define <8 x i32> @ult_10_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_10_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [10,10,10,10]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1,1.0E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_10_v8i32:
@@ -5183,36 +5171,35 @@ define <8 x i32> @ugt_10_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_11_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_11_v8i32:
@@ -5286,36 +5273,36 @@ define <8 x i32> @ult_11_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_11_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [11,11,11,11]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1,1.1E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_11_v8i32:
@@ -5389,36 +5376,35 @@ define <8 x i32> @ugt_11_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_12_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_12_v8i32:
@@ -5492,36 +5478,36 @@ define <8 x i32> @ult_12_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_12_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [12,12,12,12]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1,1.2E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_12_v8i32:
@@ -5595,36 +5581,35 @@ define <8 x i32> @ugt_12_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_13_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_13_v8i32:
@@ -5698,36 +5683,36 @@ define <8 x i32> @ult_13_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_13_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [13,13,13,13]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1,1.3E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_13_v8i32:
@@ -5801,36 +5786,35 @@ define <8 x i32> @ugt_13_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_14_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_14_v8i32:
@@ -5904,36 +5888,36 @@ define <8 x i32> @ult_14_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_14_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [14,14,14,14]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1,1.4E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_14_v8i32:
@@ -6007,36 +5991,35 @@ define <8 x i32> @ugt_14_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_15_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_15_v8i32:
@@ -6110,36 +6093,36 @@ define <8 x i32> @ult_15_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_15_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1,1.5E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_15_v8i32:
@@ -6213,36 +6196,35 @@ define <8 x i32> @ugt_15_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_16_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_16_v8i32:
@@ -6316,36 +6298,36 @@ define <8 x i32> @ult_16_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_16_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [16,16,16,16]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1,1.6E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_16_v8i32:
@@ -6419,36 +6401,35 @@ define <8 x i32> @ugt_16_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_17_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_17_v8i32:
@@ -6522,36 +6503,36 @@ define <8 x i32> @ult_17_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_17_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [17,17,17,17]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1,1.7E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_17_v8i32:
@@ -6625,36 +6606,35 @@ define <8 x i32> @ugt_17_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_18_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_18_v8i32:
@@ -6728,36 +6708,36 @@ define <8 x i32> @ult_18_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_18_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [18,18,18,18]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1,1.8E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_18_v8i32:
@@ -6831,36 +6811,35 @@ define <8 x i32> @ugt_18_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_19_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_19_v8i32:
@@ -6934,36 +6913,36 @@ define <8 x i32> @ult_19_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_19_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [19,19,19,19]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1,1.9E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_19_v8i32:
@@ -7037,36 +7016,35 @@ define <8 x i32> @ugt_19_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_20_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_20_v8i32:
@@ -7140,36 +7118,36 @@ define <8 x i32> @ult_20_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_20_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [20,20,20,20]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1,2.0E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_20_v8i32:
@@ -7243,36 +7221,35 @@ define <8 x i32> @ugt_20_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_21_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_21_v8i32:
@@ -7346,36 +7323,36 @@ define <8 x i32> @ult_21_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_21_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [21,21,21,21]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1,2.1E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_21_v8i32:
@@ -7449,36 +7426,35 @@ define <8 x i32> @ugt_21_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_22_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_22_v8i32:
@@ -7552,36 +7528,36 @@ define <8 x i32> @ult_22_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_22_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [22,22,22,22]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1,2.2E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_22_v8i32:
@@ -7655,36 +7631,35 @@ define <8 x i32> @ugt_22_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_23_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_23_v8i32:
@@ -7758,36 +7733,36 @@ define <8 x i32> @ult_23_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_23_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [23,23,23,23]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1,2.3E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_23_v8i32:
@@ -7861,36 +7836,35 @@ define <8 x i32> @ugt_23_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_24_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_24_v8i32:
@@ -7964,36 +7938,36 @@ define <8 x i32> @ult_24_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_24_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [24,24,24,24]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1,2.4E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_24_v8i32:
@@ -8067,36 +8041,35 @@ define <8 x i32> @ugt_24_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_25_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_25_v8i32:
@@ -8170,36 +8143,36 @@ define <8 x i32> @ult_25_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_25_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [25,25,25,25]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1,2.5E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_25_v8i32:
@@ -8273,36 +8246,35 @@ define <8 x i32> @ugt_25_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_26_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_26_v8i32:
@@ -8376,36 +8348,36 @@ define <8 x i32> @ult_26_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_26_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [26,26,26,26]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1,2.6E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_26_v8i32:
@@ -8479,36 +8451,35 @@ define <8 x i32> @ugt_26_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_27_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_27_v8i32:
@@ -8582,36 +8553,36 @@ define <8 x i32> @ult_27_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_27_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [27,27,27,27]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1,2.7E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_27_v8i32:
@@ -8685,36 +8656,35 @@ define <8 x i32> @ugt_27_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_28_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_28_v8i32:
@@ -8788,36 +8758,36 @@ define <8 x i32> @ult_28_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_28_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [28,28,28,28]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1,2.8E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_28_v8i32:
@@ -8891,36 +8861,35 @@ define <8 x i32> @ugt_28_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_29_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_29_v8i32:
@@ -8994,36 +8963,36 @@ define <8 x i32> @ult_29_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_29_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [29,29,29,29]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1,2.9E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_29_v8i32:
@@ -9097,36 +9066,35 @@ define <8 x i32> @ugt_29_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_30_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_30_v8i32:
@@ -9200,36 +9168,36 @@ define <8 x i32> @ult_30_v8i32(<8 x i32> %0) {
define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ugt_30_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [30,30,30,30]
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm2, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vbroadcastss {{.*#+}} ymm1 = [3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1,3.0E+1]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ugt_30_v8i32:
@@ -9303,36 +9271,35 @@ define <8 x i32> @ugt_30_v8i32(<8 x i32> %0) {
define <8 x i32> @ult_31_v8i32(<8 x i32> %0) {
; AVX1-LABEL: ult_31_v8i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa {{.*#+}} xmm3 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
-; AVX1-NEXT: vpshufb %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm4
-; AVX1-NEXT: vpand %xmm1, %xmm4, %xmm4
-; AVX1-NEXT: vpshufb %xmm4, %xmm3, %xmm4
-; AVX1-NEXT: vpaddb %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm5, %xmm5
-; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm2 = xmm2[0],zero,xmm2[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpackuswb %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm5
-; AVX1-NEXT: vpshufb %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
+; AVX1-NEXT: vbroadcastss {{.*#+}} xmm2 = [15,15,15,15,15,15,15,15,15,15,15,15,15,15,15,15]
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vmovdqa {{.*#+}} xmm4 = [0,1,1,2,1,2,2,3,1,2,2,3,2,3,3,4]
+; AVX1-NEXT: vpshufb %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpsrlw $4, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpshufb %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vpaddb %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm1[2],xmm3[2],xmm1[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm5, %xmm5
+; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm1 = xmm1[0],zero,xmm1[1],zero
+; AVX1-NEXT: vpsadbw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vpackuswb %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm5
+; AVX1-NEXT: vpshufb %xmm5, %xmm4, %xmm5
; AVX1-NEXT: vpsrlw $4, %xmm0, %xmm0
-; AVX1-NEXT: vpand %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpshufb %xmm0, %xmm3, %xmm0
+; AVX1-NEXT: vpand %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpshufb %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpaddb %xmm5, %xmm0, %xmm0
-; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm1 = xmm0[2],xmm4[2],xmm0[3],xmm4[3]
-; AVX1-NEXT: vpsadbw %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vpunpckhdq {{.*#+}} xmm2 = xmm0[2],xmm3[2],xmm0[3],xmm3[3]
+; AVX1-NEXT: vpsadbw %xmm3, %xmm2, %xmm2
; AVX1-NEXT: vpmovzxdq {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero
-; AVX1-NEXT: vpsadbw %xmm4, %xmm0, %xmm0
-; AVX1-NEXT: vpackuswb %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vbroadcastss {{.*#+}} xmm1 = [31,31,31,31]
-; AVX1-NEXT: vpcmpgtd %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vpsadbw %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vpackuswb %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: ult_31_v8i32:
diff --git a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
index ec41657d2f248f..301f98a84203a5 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-fmaximum.ll
@@ -816,64 +816,6 @@ define float @test_v16f32(<16 x float> %a0) {
; SSE41-NEXT: orps %xmm4, %xmm0
; SSE41-NEXT: retq
;
-; AVX-LABEL: test_v16f32:
-; AVX: # %bb.0:
-; AVX-NEXT: vblendvps %ymm0, %ymm1, %ymm0, %ymm2
-; AVX-NEXT: vblendvps %ymm0, %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vmaxps %ymm2, %ymm0, %ymm1
-; AVX-NEXT: vcmpunordps %ymm0, %ymm0, %ymm2
-; AVX-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
-; AVX-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX-NEXT: vblendvps %xmm0, %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm0, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmaxps %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordps %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX-NEXT: vmovd %xmm0, %eax
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_1
-; AVX-NEXT: # %bb.2:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: jmp .LBB4_3
-; AVX-NEXT: .LBB4_1:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: vmovaps %xmm0, %xmm1
-; AVX-NEXT: .LBB4_3:
-; AVX-NEXT: vmaxss %xmm2, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm2
-; AVX-NEXT: vmovd %xmm2, %eax
-; AVX-NEXT: vshufpd {{.*#+}} xmm1 = xmm0[1,0]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_4
-; AVX-NEXT: # %bb.5:
-; AVX-NEXT: vmovaps %xmm2, %xmm3
-; AVX-NEXT: jmp .LBB4_6
-; AVX-NEXT: .LBB4_4:
-; AVX-NEXT: vmovapd %xmm1, %xmm3
-; AVX-NEXT: vmovaps %xmm2, %xmm1
-; AVX-NEXT: .LBB4_6:
-; AVX-NEXT: vmaxss %xmm3, %xmm1, %xmm2
-; AVX-NEXT: vcmpunordss %xmm1, %xmm1, %xmm3
-; AVX-NEXT: vblendvps %xmm3, %xmm1, %xmm2, %xmm1
-; AVX-NEXT: vmovd %xmm1, %eax
-; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; AVX-NEXT: testl %eax, %eax
-; AVX-NEXT: js .LBB4_7
-; AVX-NEXT: # %bb.8:
-; AVX-NEXT: vmovaps %xmm1, %xmm2
-; AVX-NEXT: jmp .LBB4_9
-; AVX-NEXT: .LBB4_7:
-; AVX-NEXT: vmovaps %xmm0, %xmm2
-; AVX-NEXT: vmovaps %xmm1, %xmm0
-; AVX-NEXT: .LBB4_9:
-; AVX-NEXT: vmaxss %xmm2, %xmm0, %xmm1
-; AVX-NEXT: vcmpunordss %xmm0, %xmm0, %xmm2
-; AVX-NEXT: vblendvps %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
; AVX512BW-LABEL: test_v16f32:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vextractf64x4 $1, %zmm0, %ymm1
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
index f80544fdef7e60..1d272c8e44f584 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-bool.ll
@@ -950,12 +950,10 @@ define i1 @icmp0_v8i32_v8i1(<8 x i32>) nounwind {
;
; AVX1-LABEL: icmp0_v8i32_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vpor %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vtestps %xmm0, %xmm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
+; AVX1-NEXT: vtestps %ymm0, %ymm0
; AVX1-NEXT: setne %al
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -1301,14 +1299,14 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
;
; AVX1-LABEL: icmp0_v16i32_v16i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
index 80b4f4614383f6..87cbf55fee30a9 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-xor-bool.ll
@@ -1329,11 +1329,9 @@ define i1 @icmp0_v8i32_v8i1(<8 x i32>) nounwind {
;
; AVX1-LABEL: icmp0_v8i32_v8i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
-; AVX1-NEXT: vpcmpeqd %xmm2, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vxorps %xmm1, %xmm1, %xmm1
+; AVX1-NEXT: vcmpeqps %ymm1, %ymm0, %ymm0
; AVX1-NEXT: vmovmskps %ymm0, %eax
; AVX1-NEXT: testb %al, %al
; AVX1-NEXT: setnp %al
@@ -1721,14 +1719,14 @@ define i1 @icmp0_v16i32_v16i1(<16 x i32>) nounwind {
;
; AVX1-LABEL: icmp0_v16i32_v16i1:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpackssdw %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
+; AVX1-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm1, %ymm1
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm3
+; AVX1-NEXT: vpackssdw %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps %ymm2, %ymm0, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpeqd %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpmovmskb %xmm0, %eax
diff --git a/llvm/test/CodeGen/X86/vector-sext.ll b/llvm/test/CodeGen/X86/vector-sext.ll
index 85c1e25c29ed5b..dc9e69137a8a7e 100644
--- a/llvm/test/CodeGen/X86/vector-sext.ll
+++ b/llvm/test/CodeGen/X86/vector-sext.ll
@@ -2419,12 +2419,9 @@ define <8 x i32> @load_sext_8i1_to_8i32(ptr%ptr) {
; AVX1-NEXT: vmovd %eax, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
-; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [1,2,4,8,16,32,64,128]
-; AVX1-NEXT: vandps %ymm1, %ymm0, %ymm0
-; AVX1-NEXT: vpcmpeqd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
-; AVX1-NEXT: vpcmpeqd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
+; AVX1-NEXT: vandps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpeqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: load_sext_8i1_to_8i32:
diff --git a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
index 9fd5b9010b0cf8..0eb2221e333d29 100644
--- a/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-unsigned-cmp.ll
@@ -679,15 +679,16 @@ define void @PR54171(ptr %mask0, ptr %mask1, i64 %i) {
; AVX1-NEXT: # %bb.1: # %if.then
; AVX1-NEXT: vmovd %edx, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,0,0,0]
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vmovdqa %xmm2, (%rdi)
-; AVX1-NEXT: vmovdqa %xmm1, 16(%rdi)
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; AVX1-NEXT: vmovdqa %xmm0, (%rsi)
-; AVX1-NEXT: vmovdqa %xmm1, 16(%rsi)
+; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm0, %ymm0
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [0.0E+0,0.0E+0,1.0E+0,1.0E+0,2.0E+0,2.0E+0,3.0E+0,3.0E+0]
+; AVX1-NEXT: vcvtdq2ps %ymm0, %ymm0
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm1
+; AVX1-NEXT: vmovaps %ymm1, (%rdi)
+; AVX1-NEXT: vmovaps {{.*#+}} ymm1 = [4.0E+0,4.0E+0,5.0E+0,5.0E+0,6.0E+0,6.0E+0,7.0E+0,7.0E+0]
+; AVX1-NEXT: vcmpltps %ymm0, %ymm1, %ymm0
+; AVX1-NEXT: vmovaps %ymm0, (%rsi)
; AVX1-NEXT: .LBB18_2: # %if.end
+; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR54171:
>From 7e829779837073303b9db6e553bafcf4ff8dcc19 Mon Sep 17 00:00:00 2001
From: Noah Goldstein <goldstein.w.n at gmail.com>
Date: Thu, 7 Mar 2024 11:40:15 -0600
Subject: [PATCH 3/3] [X86] Improve helper for simplifying demanded bits of
compares
We currently only handle a single case for `pcmpgt`. This patch
extends that to work for `cmpp` and handles comparators more
generically.
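To make the shape of the fold concrete, here is a minimal hand-written IR
sketch (illustrative only, not a test from this patch) of the pre-existing
`pcmpgt`-against-zero case that the new helper generalizes to `cmpp` and to
the other predicate/constant pairs:

  ; movmskps consumes only each lane's sign bit, and
  ; signbit(sext(x s< 0)) == signbit(x), so the demanded-bits
  ; simplification can drop the compare/sext and use %x directly.
  declare i32 @llvm.x86.sse.movmsk.ps(<4 x float>)

  define i32 @sign_bit_only(<4 x i32> %x) {
    %c = icmp slt <4 x i32> %x, zeroinitializer
    %s = sext <4 x i1> %c to <4 x i32>
    %f = bitcast <4 x i32> %s to <4 x float>
    %m = call i32 @llvm.x86.sse.movmsk.ps(<4 x float> %f)
    ret i32 %m
  }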
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 197 ++++++-
llvm/lib/Target/X86/X86InstrInfo.cpp | 40 ++
llvm/lib/Target/X86/X86InstrInfo.h | 3 +
llvm/test/CodeGen/X86/combine-testps.ll | 25 +-
llvm/test/CodeGen/X86/fpclamptosat_vec.ll | 108 ++--
llvm/test/CodeGen/X86/i64-to-float.ll | 4 +-
.../CodeGen/X86/masked_store_trunc_ssat.ll | 530 ++++++++----------
.../CodeGen/X86/masked_store_trunc_usat.ll | 80 +--
llvm/test/CodeGen/X86/pr81136.ll | 16 +-
llvm/test/CodeGen/X86/sadd_sat_vec.ll | 26 +-
llvm/test/CodeGen/X86/sat-add.ll | 13 +-
.../CodeGen/X86/srem-seteq-vec-nonsplat.ll | 111 ++--
llvm/test/CodeGen/X86/ssub_sat_vec.ll | 418 +++++++-------
llvm/test/CodeGen/X86/var-permute-256.ll | 4 +-
llvm/test/CodeGen/X86/vector-pcmp.ll | 29 +-
llvm/test/CodeGen/X86/vector-trunc-ssat.ll | 317 ++++-------
llvm/test/CodeGen/X86/vector-trunc-usat.ll | 82 +--
llvm/test/CodeGen/X86/vsel-cmp-load.ll | 12 +-
18 files changed, 947 insertions(+), 1068 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index bb65a42ffd7b69..240388657511fb 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -41293,6 +41293,154 @@ static SDValue combineShuffle(SDNode *N, SelectionDAG &DAG,
return SDValue();
}
+// Simplify a decomposed (sext (setcc)). Assumes prior check that
+// bitwidth(sext)==bitwidth(setcc operands).
+static SDValue simplifySExtOfDecomposedSetCCImpl(
+ SelectionDAG &DAG, SDLoc &DL, ISD::CondCode CC, SDValue Op0, SDValue Op1,
+ const APInt &OriginalDemandedBits, const APInt &OriginalDemandedElts,
+ bool AllowNOT, unsigned Depth) {
+ // Possible TODO: We could handle any power-of-two demanded bit + unsigned
+ // comparison. There are no x86-specific comparisons that are unsigned, so it's
+ // unneeded.
+ if (!OriginalDemandedBits.isSignMask())
+ return SDValue();
+
+ EVT OpVT = Op0.getValueType();
+ // We need nofpclass(nan inf nzero) to handle floats.
+ auto hasOkayFPFlags = [](SDValue Op) {
+ return Op->getFlags().hasNoNaNs() && Op->getFlags().hasNoInfs() &&
+ Op->getFlags().hasNoSignedZeros();
+ };
+
+ if (OpVT.isFloatingPoint() && !hasOkayFPFlags(Op0))
+ return SDValue();
+
+ auto ValsEq = [OpVT](const APInt &V0, APInt V1) -> bool {
+ if (OpVT.isFloatingPoint()) {
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+ return V0.eq(APFloat(Sem, V1).bitcastToAPInt());
+ }
+ return V0.eq(V1);
+ };
+
+ // Assume we canonicalized constants to Op1. That isn't always true, but we
+ // call this function twice with swapped CC/operands, so it's fine either way.
+ APInt Op1C;
+ unsigned ValWidth = OriginalDemandedBits.getBitWidth();
+ if (ISD::isConstantSplatVectorAllZeros(Op1.getNode())) {
+ Op1C = APInt::getZero(ValWidth);
+ } else if (ISD::isConstantSplatVectorAllOnes(Op1.getNode())) {
+ Op1C = APInt::getAllOnes(ValWidth);
+ } else if (auto *C = dyn_cast<ConstantFPSDNode>(Op1)) {
+ Op1C = C->getValueAPF().bitcastToAPInt();
+ } else if (auto *C = dyn_cast<ConstantSDNode>(Op1)) {
+ Op1C = C->getAPIntValue();
+ } else if (ISD::isConstantSplatVector(Op1.getNode(), Op1C)) {
+ // isConstantSplatVector sets `Op1C`.
+ } else {
+ return SDValue();
+ }
+
+ bool Not = false;
+ bool Okay = false;
+ assert(OriginalDemandedBits.getBitWidth() == Op1C.getBitWidth() &&
+ "Invalid constant operand");
+
+ switch (CC) {
+ case ISD::SETGE:
+ case ISD::SETOGE:
+ Not = true;
+ [[fallthrough]];
+ case ISD::SETLT:
+ case ISD::SETOLT:
+ // signbit(sext(x s< 0)) == signbit(x)
+ // signbit(sext(x s>= 0)) == signbit(~x)
+ Okay = ValsEq(Op1C, APInt::getZero(ValWidth));
+ // For float ops we need to ensure Op0 is not a de-norm. Otherwise DAZ can
+ // break this fold.
+ // NB: We only need the de-norm check here; for the rest of the constants, a
+ // de-norm value and zero compare identically against them.
+ if (Okay && OpVT.isFloatingPoint()) {
+ // Values from integers are always normal.
+ if (Op0.getOpcode() == ISD::SINT_TO_FP ||
+ Op0.getOpcode() == ISD::UINT_TO_FP)
+ break;
+
+ // See if we can prove normal with known bits.
+ KnownBits Op0Known =
+ DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth);
+ // Negative/positive doesn't matter.
+ Op0Known.One.clearSignBit();
+ Op0Known.Zero.clearSignBit();
+
+ // Get min normal value.
+ const fltSemantics &Sem = SelectionDAG::EVTToAPFloatSemantics(OpVT);
+ KnownBits MinNormal = KnownBits::makeConstant(
+ APFloat::getSmallestNormalized(Sem).bitcastToAPInt());
+ // Are we above de-norm range?
+ std::optional<bool> Op0Normal = KnownBits::uge(Op0Known, MinNormal);
+ Okay = Op0Normal.has_value() && *Op0Normal;
+ }
+ break;
+ case ISD::SETGT:
+ case ISD::SETOGT:
+ Not = true;
+ [[fallthrough]];
+ case ISD::SETLE:
+ case ISD::SETOLE:
+ // signbit(sext(x s<= -1)) == signbit(x)
+ // signbit(sext(x s> -1)) == signbit(~x)
+ Okay = ValsEq(Op1C, APInt::getAllOnes(ValWidth));
+ break;
+ case ISD::SETULT:
+ Not = true;
+ [[fallthrough]];
+ case ISD::SETUGE:
+ // signbit(sext(x u>= SIGNED_MIN)) == signbit(x)
+ // signbit(sext(x u< SIGNED_MIN)) == signbit(~x)
+ Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits);
+ break;
+ case ISD::SETULE:
+ Not = true;
+ [[fallthrough]];
+ case ISD::SETUGT:
+ // signbit(sext(x u> SIGNED_MAX)) == signbit(x)
+ // signbit(sext(x u<= SIGNED_MAX)) == signbit(~x)
+ Okay = !OpVT.isFloatingPoint() && ValsEq(Op1C, OriginalDemandedBits - 1);
+ break;
+ default:
+ break;
+ }
+
+ Okay = Not ? AllowNOT : Okay;
+ if (!Okay)
+ return SDValue();
+
+ if (!Not)
+ return Op0;
+
+ if (!OpVT.isFloatingPoint())
+ return DAG.getNOT(DL, Op0, OpVT);
+
+ // Possible TODO: We could use `fneg` to implement the NOT.
+ return SDValue();
+}
+
+static SDValue simplifySExtOfDecomposedSetCC(SelectionDAG &DAG, SDLoc &DL,
+ ISD::CondCode CC, SDValue Op0,
+ SDValue Op1,
+ const APInt &OriginalDemandedBits,
+ const APInt &OriginalDemandedElts,
+ bool AllowNOT, unsigned Depth) {
+ if (SDValue R = simplifySExtOfDecomposedSetCCImpl(
+ DAG, DL, CC, Op0, Op1, OriginalDemandedBits, OriginalDemandedElts,
+ AllowNOT, Depth))
+ return R;
+ return simplifySExtOfDecomposedSetCCImpl(
+ DAG, DL, ISD::getSetCCSwappedOperands(CC), Op1, Op0, OriginalDemandedBits,
+ OriginalDemandedElts, AllowNOT, Depth);
+}
+
// Simplify variable target shuffle masks based on the demanded elements.
// TODO: Handle DemandedBits in mask indices as well?
bool X86TargetLowering::SimplifyDemandedVectorEltsForTargetShuffle(
@@ -42472,13 +42620,26 @@ bool X86TargetLowering::SimplifyDemandedBitsForTargetNode(
}
break;
}
- case X86ISD::PCMPGT:
- // icmp sgt(0, R) == ashr(R, BitWidth-1).
- // iff we only need the sign bit then we can use R directly.
- if (OriginalDemandedBits.isSignMask() &&
- ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
- return TLO.CombineTo(Op, Op.getOperand(1));
+ case X86ISD::PCMPGT: {
+ SDLoc DL(Op);
+ if (SDValue R = simplifySExtOfDecomposedSetCC(
+ TLO.DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+ OriginalDemandedBits, OriginalDemandedElts,
+ /*AllowNOT*/ true, Depth))
+ return TLO.CombineTo(Op, R);
+ break;
+ }
+ case X86ISD::CMPP: {
+ SDLoc DL(Op);
+ ISD::CondCode CC = X86::getCondForCMPPImm(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ if (SDValue R = simplifySExtOfDecomposedSetCC(
+ TLO.DAG, DL, CC, Op.getOperand(0), Op.getOperand(1),
+ OriginalDemandedBits, OriginalDemandedElts,
+ !(TLO.LegalOperations() && TLO.LegalTypes()), Depth))
+ return TLO.CombineTo(Op, R);
break;
+ }
case X86ISD::MOVMSK: {
SDValue Src = Op.getOperand(0);
MVT SrcVT = Src.getSimpleValueType();
@@ -42662,13 +42823,25 @@ SDValue X86TargetLowering::SimplifyMultipleUseDemandedBitsForTargetNode(
if (DemandedBits.isSignMask())
return Op.getOperand(0);
break;
- case X86ISD::PCMPGT:
- // icmp sgt(0, R) == ashr(R, BitWidth-1).
- // iff we only need the sign bit then we can use R directly.
- if (DemandedBits.isSignMask() &&
- ISD::isBuildVectorAllZeros(Op.getOperand(0).getNode()))
- return Op.getOperand(1);
+ case X86ISD::PCMPGT: {
+ SDLoc DL(Op);
+ if (SDValue R = simplifySExtOfDecomposedSetCC(
+ DAG, DL, ISD::SETGT, Op.getOperand(0), Op.getOperand(1),
+ DemandedBits, DemandedElts, /*AllowNOT*/ false, Depth))
+ return R;
+ break;
+ }
+ case X86ISD::CMPP: {
+ SDLoc DL(Op);
+ ISD::CondCode CC = X86::getCondForCMPPImm(
+ cast<ConstantSDNode>(Op.getOperand(2))->getZExtValue());
+ if (SDValue R = simplifySExtOfDecomposedSetCC(DAG, DL, CC, Op.getOperand(0),
+ Op.getOperand(1),
+ DemandedBits, DemandedElts,
+ /*AllowNOT*/ false, Depth))
+ return R;
break;
+ }
case X86ISD::BLENDV: {
// BLENDV: Cond (MSB) ? LHS : RHS
SDValue Cond = Op.getOperand(0);
diff --git a/llvm/lib/Target/X86/X86InstrInfo.cpp b/llvm/lib/Target/X86/X86InstrInfo.cpp
index 3f0557e651f89b..2e331efd9c3d0b 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.cpp
+++ b/llvm/lib/Target/X86/X86InstrInfo.cpp
@@ -3349,6 +3349,46 @@ unsigned X86::getVPCMPImmForCond(ISD::CondCode CC) {
}
}
+ISD::CondCode X86::getCondForCMPPImm(unsigned Imm) {
+ assert(Imm <= 0x1f && "Invalid CMPP Imm");
+ switch (Imm & 0xf) {
+ default:
+ llvm_unreachable("Invalid CMPP Imm");
+ case 0:
+ return ISD::SETOEQ;
+ case 1:
+ return ISD::SETOLT;
+ case 2:
+ return ISD::SETOLE;
+ case 3:
+ return ISD::SETUO;
+ case 4:
+ return ISD::SETUNE;
+ case 5:
+ return ISD::SETUGE;
+ case 6:
+ return ISD::SETUGT;
+ case 7:
+ return ISD::SETO;
+ case 8:
+ return ISD::SETUEQ;
+ case 9:
+ return ISD::SETULT;
+ case 10:
+ return ISD::SETULE;
+ case 11:
+ return ISD::SETFALSE;
+ case 12:
+ return ISD::SETONE;
+ case 13:
+ return ISD::SETOGE;
+ case 14:
+ return ISD::SETOGT;
+ case 15:
+ return ISD::SETTRUE;
+ }
+}
+
/// Get the VPCMP immediate if the operands are swapped.
unsigned X86::getSwappedVPCMPImm(unsigned Imm) {
switch (Imm) {
diff --git a/llvm/lib/Target/X86/X86InstrInfo.h b/llvm/lib/Target/X86/X86InstrInfo.h
index 0e5fcbeda08f79..4569a74aab54eb 100644
--- a/llvm/lib/Target/X86/X86InstrInfo.h
+++ b/llvm/lib/Target/X86/X86InstrInfo.h
@@ -68,6 +68,9 @@ CondCode GetOppositeBranchCondition(CondCode CC);
/// Get the VPCMP immediate for the given condition.
unsigned getVPCMPImmForCond(ISD::CondCode CC);
+/// Get the CondCode from a CMPP immediate.
+ISD::CondCode getCondForCMPPImm(unsigned Imm);
+
/// Get the VPCMP immediate if the opcodes are swapped.
unsigned getSwappedVPCMPImm(unsigned Imm);
diff --git a/llvm/test/CodeGen/X86/combine-testps.ll b/llvm/test/CodeGen/X86/combine-testps.ll
index 66165ce2aa53a5..43dddbecf51a7d 100644
--- a/llvm/test/CodeGen/X86/combine-testps.ll
+++ b/llvm/test/CodeGen/X86/combine-testps.ll
@@ -171,24 +171,13 @@ define i32 @testpsz_128_signbit(<4 x float> %c, <4 x float> %d, i32 %a, i32 %b)
}
define i32 @testpsnzc_256_signbit(<8 x float> %c, <8 x float> %d, i32 %a, i32 %b) {
-; AVX-LABEL: testpsnzc_256_signbit:
-; AVX: # %bb.0:
-; AVX-NEXT: movl %edi, %eax
-; AVX-NEXT: vcvtdq2ps %ymm0, %ymm0
-; AVX-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
-; AVX-NEXT: vtestps %ymm1, %ymm0
-; AVX-NEXT: cmovnel %esi, %eax
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
-;
-; AVX2-LABEL: testpsnzc_256_signbit:
-; AVX2: # %bb.0:
-; AVX2-NEXT: movl %edi, %eax
-; AVX2-NEXT: vtestps %ymm1, %ymm0
-; AVX2-NEXT: cmovnel %esi, %eax
-; AVX2-NEXT: vzeroupper
-; AVX2-NEXT: retq
+; CHECK-LABEL: testpsnzc_256_signbit:
+; CHECK: # %bb.0:
+; CHECK-NEXT: movl %edi, %eax
+; CHECK-NEXT: vtestps %ymm1, %ymm0
+; CHECK-NEXT: cmovnel %esi, %eax
+; CHECK-NEXT: vzeroupper
+; CHECK-NEXT: retq
%t0 = bitcast <8 x float> %c to <8 x i32>
%t1 = icmp sgt <8 x i32> zeroinitializer, %t0
%t2 = sext <8 x i1> %t1 to <8 x i32>
diff --git a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
index 6aad4c2ebba1d8..8254af9f29d03e 100644
--- a/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
+++ b/llvm/test/CodeGen/X86/fpclamptosat_vec.ll
@@ -47,17 +47,15 @@ define <2 x i32> @stest_f64i32(<2 x double> %x) nounwind {
; AVX2-LABEL: stest_f64i32:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
+; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: retq
;
@@ -134,9 +132,7 @@ define <2 x i32> @utest_f64i32(<2 x double> %x) nounwind {
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: retq
;
@@ -319,9 +315,8 @@ define <4 x i32> @stest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -458,11 +453,7 @@ define <4 x i32> @utest_f32i32(<4 x float> %x) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -701,17 +692,17 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvttss2si %xmm1, %rcx
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
@@ -719,9 +710,8 @@ define <4 x i32> @stest_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -880,11 +870,7 @@ define <4 x i32> @utesth_f16i32(<4 x half> %x) nounwind {
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -2669,17 +2655,15 @@ define <2 x i32> @stest_f64i32_mm(<2 x double> %x) nounwind {
; AVX2-LABEL: stest_f64i32_mm:
; AVX2: # %bb.0: # %entry
; AVX2-NEXT: vcvttsd2si %xmm0, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: vshufpd {{.*#+}} xmm0 = xmm0[1,0]
-; AVX2-NEXT: vcvttsd2si %xmm0, %rcx
+; AVX2-NEXT: vcvttsd2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: retq
;
@@ -2754,9 +2738,7 @@ define <2 x i32> @utest_f64i32_mm(<2 x double> %x) nounwind {
; AVX2-NEXT: orq %rcx, %rdx
; AVX2-NEXT: vmovq %rdx, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: retq
;
@@ -2936,9 +2918,8 @@ define <4 x i32> @stest_f32i32_mm(<4 x float> %x) nounwind {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -3072,12 +3053,8 @@ define <4 x i32> @utest_f32i32_mm(<4 x float> %x) nounwind {
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -3313,17 +3290,17 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: vpshuflw {{.*#+}} xmm1 = xmm0[3,3,3,3,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovshdup {{.*#+}} xmm1 = xmm0[1,1,3,3]
-; AVX2-NEXT: vcvtph2ps %xmm1, %xmm1
-; AVX2-NEXT: vcvttss2si %xmm1, %rcx
-; AVX2-NEXT: vcvtph2ps %xmm0, %xmm1
+; AVX2-NEXT: vmovq %rax, %xmm1
+; AVX2-NEXT: vmovshdup {{.*#+}} xmm2 = xmm0[1,1,3,3]
+; AVX2-NEXT: vcvtph2ps %xmm2, %xmm2
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
+; AVX2-NEXT: vmovq %rax, %xmm2
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
+; AVX2-NEXT: vcvtph2ps %xmm0, %xmm2
+; AVX2-NEXT: vcvttss2si %xmm2, %rax
; AVX2-NEXT: vmovq %rax, %xmm2
-; AVX2-NEXT: vcvttss2si %xmm1, %rax
-; AVX2-NEXT: vmovq %rcx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm1[0],xmm2[0]
; AVX2-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[1,1,1,1,4,5,6,7]
; AVX2-NEXT: vcvtph2ps %xmm0, %xmm0
-; AVX2-NEXT: vmovq %rax, %xmm2
; AVX2-NEXT: vcvttss2si %xmm0, %rax
; AVX2-NEXT: vmovq %rax, %xmm0
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm2[0],xmm0[0]
@@ -3331,9 +3308,8 @@ define <4 x i32> @stest_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -3489,12 +3465,8 @@ define <4 x i32> @utesth_f16i32_mm(<4 x half> %x) nounwind {
; AVX2-NEXT: vmovq %rdx, %xmm1
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [4294967295,4294967295,4294967295,4294967295]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-NEXT: vpermps %ymm0, %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/i64-to-float.ll b/llvm/test/CodeGen/X86/i64-to-float.ll
index 0a9da876428844..cd2c146c0a3b8b 100644
--- a/llvm/test/CodeGen/X86/i64-to-float.ll
+++ b/llvm/test/CodeGen/X86/i64-to-float.ll
@@ -352,9 +352,7 @@ define <2 x double> @clamp_sitofp_2i64_2f64(<2 x i64> %a) nounwind {
;
; X64-AVX-LABEL: clamp_sitofp_2i64_2f64:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709551361,18446744073709551361]
-; X64-AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; X64-AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; X64-AVX-NEXT: vpmovzxbq {{.*#+}} xmm1 = [255,255]
; X64-AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; X64-AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index bc557f75f02a25..ea6b707a0022b8 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -181,43 +181,36 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: pmovsxdq {{.*#+}} xmm10 = [2147483647,2147483647]
-; SSE4-NEXT: movdqa %xmm10, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: pxor %xmm9, %xmm9
+; SSE4-NEXT: pmovsxdq {{.*#+}} xmm8 = [2147483647,2147483647]
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm10, %xmm8
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE4-NEXT: movdqa %xmm10, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm10, %xmm9
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm10, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE4-NEXT: movdqa %xmm10, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm10
-; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; SSE4-NEXT: movapd %xmm10, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm10, %xmm6
-; SSE4-NEXT: movapd %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm2
+; SSE4-NEXT: movdqa %xmm8, %xmm2
; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE4-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,2],xmm6[0,2]
-; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm3
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; SSE4-NEXT: movapd %xmm8, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE4-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm8[0,2]
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: movapd %xmm6, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm9, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
; SSE4-NEXT: pmovmskb %xmm4, %eax
@@ -234,9 +227,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB0_8
; SSE4-NEXT: .LBB0_7: # %cond.store5
-; SSE4-NEXT: extractps $3, %xmm2, 12(%rdi)
+; SSE4-NEXT: extractps $3, %xmm3, 12(%rdi)
; SSE4-NEXT: .LBB0_8: # %else6
-; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
+; SSE4-NEXT: shufps {{.*#+}} xmm6 = xmm6[0,2],xmm2[0,2]
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: jne .LBB0_9
; SSE4-NEXT: # %bb.10: # %else8
@@ -251,32 +244,32 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB0_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB0_1: # %cond.store
-; SSE4-NEXT: movss %xmm2, (%rdi)
+; SSE4-NEXT: movss %xmm3, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB0_4
; SSE4-NEXT: .LBB0_3: # %cond.store1
-; SSE4-NEXT: extractps $1, %xmm2, 4(%rdi)
+; SSE4-NEXT: extractps $1, %xmm3, 4(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB0_6
; SSE4-NEXT: .LBB0_5: # %cond.store3
-; SSE4-NEXT: extractps $2, %xmm2, 8(%rdi)
+; SSE4-NEXT: extractps $2, %xmm3, 8(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: jne .LBB0_7
; SSE4-NEXT: jmp .LBB0_8
; SSE4-NEXT: .LBB0_9: # %cond.store7
-; SSE4-NEXT: movss %xmm1, 16(%rdi)
+; SSE4-NEXT: movss %xmm6, 16(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB0_12
; SSE4-NEXT: .LBB0_11: # %cond.store9
-; SSE4-NEXT: extractps $1, %xmm1, 20(%rdi)
+; SSE4-NEXT: extractps $1, %xmm6, 20(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB0_14
; SSE4-NEXT: .LBB0_13: # %cond.store11
-; SSE4-NEXT: extractps $2, %xmm1, 24(%rdi)
+; SSE4-NEXT: extractps $2, %xmm6, 24(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB0_16
; SSE4-NEXT: .LBB0_15: # %cond.store13
-; SSE4-NEXT: extractps $3, %xmm1, 28(%rdi)
+; SSE4-NEXT: extractps $3, %xmm6, 28(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i32:
@@ -298,14 +291,10 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm5, %xmm5
+; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm3, %ymm3
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
@@ -324,11 +313,9 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm3, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vperm2f128 {{.*#+}} ymm3 = ymm0[2,3],ymm1[2,3]
; AVX2-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm3[0,2],ymm0[4,6],ymm3[4,6]
@@ -558,45 +545,38 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: pmovsxwq {{.*#+}} xmm9 = [32767,32767]
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: pxor %xmm10, %xmm10
+; SSE4-NEXT: pmovsxwq {{.*#+}} xmm8 = [32767,32767]
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm8
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm2
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm2
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE4-NEXT: pmovsxwq {{.*#+}} xmm6 = [18446744073709518848,18446744073709518848]
-; SSE4-NEXT: movapd %xmm9, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10
-; SSE4-NEXT: movapd %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm1
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; SSE4-NEXT: movapd %xmm8, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6
-; SSE4-NEXT: packssdw %xmm3, %xmm6
-; SSE4-NEXT: packssdw %xmm6, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: packssdw %xmm8, %xmm2
+; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE4-NEXT: movapd %xmm6, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE4-NEXT: packssdw %xmm9, %xmm6
+; SSE4-NEXT: packssdw %xmm6, %xmm2
+; SSE4-NEXT: pcmpeqd %xmm10, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm10, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
; SSE4-NEXT: pmovmskb %xmm4, %eax
@@ -627,35 +607,35 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB1_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB1_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm1, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm2, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB1_4
; SSE4-NEXT: .LBB1_3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm2, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB1_6
; SSE4-NEXT: .LBB1_5: # %cond.store3
-; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm2, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB1_8
; SSE4-NEXT: .LBB1_7: # %cond.store5
-; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm2, 6(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB1_10
; SSE4-NEXT: .LBB1_9: # %cond.store7
-; SSE4-NEXT: pextrw $4, %xmm1, 8(%rdi)
+; SSE4-NEXT: pextrw $4, %xmm2, 8(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB1_12
; SSE4-NEXT: .LBB1_11: # %cond.store9
-; SSE4-NEXT: pextrw $5, %xmm1, 10(%rdi)
+; SSE4-NEXT: pextrw $5, %xmm2, 10(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB1_14
; SSE4-NEXT: .LBB1_13: # %cond.store11
-; SSE4-NEXT: pextrw $6, %xmm1, 12(%rdi)
+; SSE4-NEXT: pextrw $6, %xmm2, 12(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB1_16
; SSE4-NEXT: .LBB1_15: # %cond.store13
-; SSE4-NEXT: pextrw $7, %xmm1, 14(%rdi)
+; SSE4-NEXT: pextrw $7, %xmm2, 14(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i16:
@@ -671,16 +651,13 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm1
@@ -754,11 +731,9 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -1096,46 +1071,39 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v8i64_v8i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm6
-; SSE4-NEXT: pxor %xmm7, %xmm7
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm9 = [127,127]
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm0, %xmm7
+; SSE4-NEXT: pxor %xmm10, %xmm10
+; SSE4-NEXT: pmovsxbq {{.*#+}} xmm8 = [127,127]
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm8
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm8
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm6
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm6
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm2
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
-; SSE4-NEXT: movdqa %xmm9, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm9, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm3
-; SSE4-NEXT: movdqa %xmm9, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm9
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm9
+; SSE4-NEXT: movdqa %xmm8, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm7, %xmm0
+; SSE4-NEXT: movdqa %xmm8, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, %xmm7, %xmm2
+; SSE4-NEXT: movdqa %xmm8, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
-; SSE4-NEXT: movapd %xmm9, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm10
-; SSE4-NEXT: blendvpd %xmm0, %xmm9, %xmm10
-; SSE4-NEXT: movapd %xmm3, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm1
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm10, %xmm1
-; SSE4-NEXT: movapd %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: movdqa %xmm6, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE4-NEXT: movapd %xmm8, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm6, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm8, %xmm6
-; SSE4-NEXT: packssdw %xmm3, %xmm6
-; SSE4-NEXT: packssdw %xmm6, %xmm1
-; SSE4-NEXT: packsswb %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm5
-; SSE4-NEXT: pcmpeqd %xmm7, %xmm4
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm8
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm2
+; SSE4-NEXT: packssdw %xmm8, %xmm2
+; SSE4-NEXT: movapd %xmm9, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm9
+; SSE4-NEXT: movapd %xmm6, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm6
+; SSE4-NEXT: packssdw %xmm9, %xmm6
+; SSE4-NEXT: packssdw %xmm6, %xmm2
+; SSE4-NEXT: packsswb %xmm2, %xmm2
+; SSE4-NEXT: pcmpeqd %xmm10, %xmm5
+; SSE4-NEXT: pcmpeqd %xmm10, %xmm4
; SSE4-NEXT: packssdw %xmm5, %xmm4
; SSE4-NEXT: packsswb %xmm4, %xmm4
; SSE4-NEXT: pmovmskb %xmm4, %eax
@@ -1166,35 +1134,35 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; SSE4-NEXT: .LBB2_16: # %else14
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB2_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm1, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm2, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB2_4
; SSE4-NEXT: .LBB2_3: # %cond.store1
-; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB2_6
; SSE4-NEXT: .LBB2_5: # %cond.store3
-; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm2, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB2_8
; SSE4-NEXT: .LBB2_7: # %cond.store5
-; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm2, 3(%rdi)
; SSE4-NEXT: testb $16, %al
; SSE4-NEXT: je .LBB2_10
; SSE4-NEXT: .LBB2_9: # %cond.store7
-; SSE4-NEXT: pextrb $4, %xmm1, 4(%rdi)
+; SSE4-NEXT: pextrb $4, %xmm2, 4(%rdi)
; SSE4-NEXT: testb $32, %al
; SSE4-NEXT: je .LBB2_12
; SSE4-NEXT: .LBB2_11: # %cond.store9
-; SSE4-NEXT: pextrb $5, %xmm1, 5(%rdi)
+; SSE4-NEXT: pextrb $5, %xmm2, 5(%rdi)
; SSE4-NEXT: testb $64, %al
; SSE4-NEXT: je .LBB2_14
; SSE4-NEXT: .LBB2_13: # %cond.store11
-; SSE4-NEXT: pextrb $6, %xmm1, 6(%rdi)
+; SSE4-NEXT: pextrb $6, %xmm2, 6(%rdi)
; SSE4-NEXT: testb $-128, %al
; SSE4-NEXT: je .LBB2_16
; SSE4-NEXT: .LBB2_15: # %cond.store13
-; SSE4-NEXT: pextrb $7, %xmm1, 7(%rdi)
+; SSE4-NEXT: pextrb $7, %xmm2, 7(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v8i64_v8i8:
@@ -1210,16 +1178,13 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm6
; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm5, %xmm6
-; AVX1-NEXT: vblendvpd %xmm6, %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm5, %xmm5
; AVX1-NEXT: vpackssdw %xmm0, %xmm5, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpackssdw %xmm1, %xmm3, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -1294,11 +1259,9 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -1555,27 +1518,24 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v4i64_v4i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm3
-; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm4
+; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: pmovsxdq {{.*#+}} xmm5 = [2147483647,2147483647]
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm5, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT: movdqa %xmm5, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT: pmovsxdq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3
-; SSE4-NEXT: movapd %xmm6, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE4-NEXT: shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE4-NEXT: movmskps %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE4-NEXT: shufps {{.*#+}} xmm3 = xmm3[0,2],xmm5[0,2]
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB3_1
@@ -1591,19 +1551,19 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-NEXT: .LBB3_8: # %else6
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB3_1: # %cond.store
-; SSE4-NEXT: movss %xmm1, (%rdi)
+; SSE4-NEXT: movss %xmm3, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB3_4
; SSE4-NEXT: .LBB3_3: # %cond.store1
-; SSE4-NEXT: extractps $1, %xmm1, 4(%rdi)
+; SSE4-NEXT: extractps $1, %xmm3, 4(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB3_6
; SSE4-NEXT: .LBB3_5: # %cond.store3
-; SSE4-NEXT: extractps $2, %xmm1, 8(%rdi)
+; SSE4-NEXT: extractps $2, %xmm3, 8(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB3_8
; SSE4-NEXT: .LBB3_7: # %cond.store5
-; SSE4-NEXT: extractps $3, %xmm1, 12(%rdi)
+; SSE4-NEXT: extractps $3, %xmm3, 12(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i32:
@@ -1621,10 +1581,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm3, %xmm2
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm2[0,2],xmm0[0,2]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: vzeroupper
@@ -1639,9 +1597,8 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
@@ -1788,28 +1745,25 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v4i64_v4i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm3
-; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm4
+; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: pmovsxwq {{.*#+}} xmm5 = [32767,32767]
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm5, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT: movdqa %xmm5, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT: pmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3
-; SSE4-NEXT: movapd %xmm6, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE4-NEXT: movmskps %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE4-NEXT: packssdw %xmm5, %xmm3
+; SSE4-NEXT: packssdw %xmm3, %xmm3
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB4_1
@@ -1825,19 +1779,19 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-NEXT: .LBB4_8: # %else6
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB4_1: # %cond.store
-; SSE4-NEXT: pextrw $0, %xmm1, (%rdi)
+; SSE4-NEXT: pextrw $0, %xmm3, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB4_4
; SSE4-NEXT: .LBB4_3: # %cond.store1
-; SSE4-NEXT: pextrw $1, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrw $1, %xmm3, 2(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB4_6
; SSE4-NEXT: .LBB4_5: # %cond.store3
-; SSE4-NEXT: pextrw $2, %xmm1, 4(%rdi)
+; SSE4-NEXT: pextrw $2, %xmm3, 4(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB4_8
; SSE4-NEXT: .LBB4_7: # %cond.store5
-; SSE4-NEXT: pextrw $3, %xmm1, 6(%rdi)
+; SSE4-NEXT: pextrw $3, %xmm3, 6(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i16:
@@ -1849,11 +1803,10 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm1, %xmm1
@@ -1896,9 +1849,8 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [32767,32767,32767,32767]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -2103,29 +2055,26 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
;
; SSE4-LABEL: truncstore_v4i64_v4i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm3
-; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: movdqa %xmm0, %xmm4
+; SSE4-NEXT: pxor %xmm6, %xmm6
; SSE4-NEXT: pmovsxbq {{.*#+}} xmm5 = [127,127]
; SSE4-NEXT: movdqa %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE4-NEXT: movdqa %xmm5, %xmm6
-; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm6
+; SSE4-NEXT: pcmpgtq %xmm4, %xmm0
+; SSE4-NEXT: movdqa %xmm5, %xmm3
+; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm3
; SSE4-NEXT: movdqa %xmm5, %xmm0
; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; SSE4-NEXT: movapd {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
; SSE4-NEXT: movapd %xmm5, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: movdqa %xmm1, %xmm3
-; SSE4-NEXT: blendvpd %xmm0, %xmm5, %xmm3
-; SSE4-NEXT: movapd %xmm6, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm1, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm6, %xmm1
-; SSE4-NEXT: packssdw %xmm3, %xmm1
-; SSE4-NEXT: packssdw %xmm1, %xmm1
-; SSE4-NEXT: packsswb %xmm1, %xmm1
-; SSE4-NEXT: pcmpeqd %xmm2, %xmm4
-; SSE4-NEXT: movmskps %xmm4, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm5
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm1, %xmm3
+; SSE4-NEXT: packssdw %xmm5, %xmm3
+; SSE4-NEXT: packssdw %xmm3, %xmm3
+; SSE4-NEXT: packsswb %xmm3, %xmm3
+; SSE4-NEXT: pcmpeqd %xmm2, %xmm6
+; SSE4-NEXT: movmskps %xmm6, %eax
; SSE4-NEXT: xorl $15, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB5_1
@@ -2141,19 +2090,19 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; SSE4-NEXT: .LBB5_8: # %else6
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB5_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm1, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm3, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB5_4
; SSE4-NEXT: .LBB5_3: # %cond.store1
-; SSE4-NEXT: pextrb $1, %xmm1, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm3, 1(%rdi)
; SSE4-NEXT: testb $4, %al
; SSE4-NEXT: je .LBB5_6
; SSE4-NEXT: .LBB5_5: # %cond.store3
-; SSE4-NEXT: pextrb $2, %xmm1, 2(%rdi)
+; SSE4-NEXT: pextrb $2, %xmm3, 2(%rdi)
; SSE4-NEXT: testb $8, %al
; SSE4-NEXT: je .LBB5_8
; SSE4-NEXT: .LBB5_7: # %cond.store5
-; SSE4-NEXT: pextrb $3, %xmm1, 3(%rdi)
+; SSE4-NEXT: pextrb $3, %xmm3, 3(%rdi)
; SSE4-NEXT: retq
;
; AVX1-LABEL: truncstore_v4i64_v4i8:
@@ -2165,11 +2114,10 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm3, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm4, %xmm3
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -2213,9 +2161,8 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [127,127,127,127]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm3, %ymm4
; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm0, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackssdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -2377,18 +2324,16 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-LABEL: truncstore_v2i64_v2i32:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pmovsxdq {{.*#+}} xmm4 = [2147483647,2147483647]
-; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE4-NEXT: pmovsxdq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
-; SSE4-NEXT: movapd %xmm4, %xmm0
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: pmovsxdq {{.*#+}} xmm3 = [2147483647,2147483647]
+; SSE4-NEXT: movdqa %xmm3, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB6_1
@@ -2416,10 +2361,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: # xmm2 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -2434,9 +2376,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm2 = [18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm3
-; AVX2-NEXT: vblendvpd %xmm3, %xmm0, %xmm2, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -2539,19 +2479,17 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-LABEL: truncstore_v2i64_v2i16:
; SSE4: # %bb.0:
; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pmovsxwq {{.*#+}} xmm4 = [32767,32767]
-; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE4-NEXT: pmovsxwq {{.*#+}} xmm2 = [18446744073709518848,18446744073709518848]
-; SSE4-NEXT: movapd %xmm4, %xmm0
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: pmovsxwq {{.*#+}} xmm3 = [32767,32767]
+; SSE4-NEXT: movdqa %xmm3, %xmm0
; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm3
+; SSE4-NEXT: movapd %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm3[0,2,2,3]
; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB7_1
@@ -2574,9 +2512,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [32767,32767]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpmovsxwq {{.*#+}} xmm3 = [18446744073709518848,18446744073709518848]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
@@ -2706,19 +2642,17 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
;
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm4 = [127,127]
-; SSE4-NEXT: movdqa %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm2, %xmm4
-; SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [18446744073709551488,18446744073709551488]
-; SSE4-NEXT: movapd %xmm4, %xmm0
-; SSE4-NEXT: pcmpgtq %xmm2, %xmm0
-; SSE4-NEXT: blendvpd %xmm0, %xmm4, %xmm2
+; SSE4-NEXT: movdqa %xmm0, %xmm3
+; SSE4-NEXT: pxor %xmm4, %xmm4
+; SSE4-NEXT: pmovsxbq {{.*#+}} xmm2 = [127,127]
+; SSE4-NEXT: movdqa %xmm2, %xmm0
+; SSE4-NEXT: pcmpgtq %xmm3, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE4-NEXT: movapd %xmm2, %xmm0
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm4
+; SSE4-NEXT: movmskpd %xmm4, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB8_1
@@ -2741,9 +2675,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [127,127]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm3, %xmm4
; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
-; AVX-NEXT: vpmovsxbq {{.*#+}} xmm3 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm3, %xmm0, %xmm4
-; AVX-NEXT: vblendvpd %xmm4, %xmm0, %xmm3, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 8215c52aabfbef..ff84c449a6a462 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -1395,11 +1395,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [4294967295,4294967295,4294967295,4294967295]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm3
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm2
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm2[0,2]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
@@ -1620,11 +1616,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
-; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -1903,11 +1895,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
; AVX2: # %bb.0:
; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm3 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm4, %ymm0, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
-; AVX2-NEXT: vpcmpgtq %ymm5, %ymm4, %ymm4
-; AVX2-NEXT: vblendvpd %ymm4, %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm3, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX2-NEXT: vpackusdw %xmm3, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2050,15 +2038,11 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
;
; SSE4-LABEL: truncstore_v2i64_v2i32:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
+; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB6_1
@@ -2082,9 +2066,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmaskmovps %xmm0, %xmm1, (%rdi)
; AVX1-NEXT: retq
@@ -2096,9 +2078,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX2-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX2-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,2],zero,zero
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm2
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vpmaskmovd %xmm0, %xmm1, (%rdi)
; AVX2-NEXT: retq
@@ -2182,16 +2162,12 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
;
; SSE4-LABEL: truncstore_v2i64_v2i16:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SSE4-NEXT: pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
+; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB7_1
@@ -2211,9 +2187,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; AVX-LABEL: truncstore_v2i64_v2i16:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
@@ -2324,15 +2298,11 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
;
; SSE4-LABEL: truncstore_v2i64_v2i8:
; SSE4: # %bb.0:
-; SSE4-NEXT: movdqa %xmm0, %xmm2
-; SSE4-NEXT: pxor %xmm3, %xmm3
-; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE4-NEXT: pxor %xmm2, %xmm0
-; SSE4-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; SSE4-NEXT: pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
-; SSE4-NEXT: pcmpeqq %xmm1, %xmm3
-; SSE4-NEXT: movmskpd %xmm3, %eax
+; SSE4-NEXT: pxor %xmm2, %xmm2
+; SSE4-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE4-NEXT: pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
+; SSE4-NEXT: pcmpeqq %xmm1, %xmm2
+; SSE4-NEXT: movmskpd %xmm2, %eax
; SSE4-NEXT: xorl $3, %eax
; SSE4-NEXT: testb $1, %al
; SSE4-NEXT: jne .LBB8_1
@@ -2342,19 +2312,17 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
; SSE4-NEXT: .LBB8_4: # %else2
; SSE4-NEXT: retq
; SSE4-NEXT: .LBB8_1: # %cond.store
-; SSE4-NEXT: pextrb $0, %xmm2, (%rdi)
+; SSE4-NEXT: pextrb $0, %xmm0, (%rdi)
; SSE4-NEXT: testb $2, %al
; SSE4-NEXT: je .LBB8_4
; SSE4-NEXT: .LBB8_3: # %cond.store1
-; SSE4-NEXT: pextrb $1, %xmm2, 1(%rdi)
+; SSE4-NEXT: pextrb $1, %xmm0, 1(%rdi)
; SSE4-NEXT: retq
;
; AVX-LABEL: truncstore_v2i64_v2i8:
; AVX: # %bb.0:
; AVX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm3
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
-; AVX-NEXT: vblendvpd %xmm3, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpcmpeqq %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovmskpd %xmm1, %eax
diff --git a/llvm/test/CodeGen/X86/pr81136.ll b/llvm/test/CodeGen/X86/pr81136.ll
index b4ac3fc783e0a9..2cc486c41dd66b 100644
--- a/llvm/test/CodeGen/X86/pr81136.ll
+++ b/llvm/test/CodeGen/X86/pr81136.ll
@@ -7,22 +7,20 @@ define i64 @PR81136(i32 %a0, i32 %a1, ptr %a2) {
; CHECK-NEXT: vmovd %edi, %xmm0
; CHECK-NEXT: vmovd %esi, %xmm1
; CHECK-NEXT: vmovdqa (%rdx), %ymm2
-; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; CHECK-NEXT: vpmovzxbq {{.*#+}} xmm4 = [128,1]
-; CHECK-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm4
+; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm3
+; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
+; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
; CHECK-NEXT: vpcmpgtw %xmm0, %xmm1, %xmm0
; CHECK-NEXT: vpcmpeqd %xmm1, %xmm1, %xmm1
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
; CHECK-NEXT: vpmovsxwq %xmm0, %xmm0
; CHECK-NEXT: vpalignr {{.*#+}} xmm0 = mem[8,9,10,11,12,13,14,15],xmm0[0,1,2,3,4,5,6,7]
+; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; CHECK-NEXT: vpxor %xmm3, %xmm3, %xmm3
; CHECK-NEXT: vpcmpeqq %xmm3, %xmm0, %xmm0
; CHECK-NEXT: vpxor %xmm1, %xmm0, %xmm0
-; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm1
-; CHECK-NEXT: vextractf128 $1, %ymm2, %xmm2
-; CHECK-NEXT: vpcmpeqq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; CHECK-NEXT: vinsertf128 $1, %xmm0, %ymm4, %ymm0
-; CHECK-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
-; CHECK-NEXT: vandnpd %ymm0, %ymm1, %ymm0
+; CHECK-NEXT: vperm2f128 {{.*#+}} ymm0 = mem[0,1],ymm0[0,1]
+; CHECK-NEXT: vandnpd %ymm0, %ymm2, %ymm0
; CHECK-NEXT: vmovmskpd %ymm0, %eax
; CHECK-NEXT: popcntl %eax, %eax
; CHECK-NEXT: negq %rax
diff --git a/llvm/test/CodeGen/X86/sadd_sat_vec.ll b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
index bc01f0a8ac52c3..b2b242fa29818f 100644
--- a/llvm/test/CodeGen/X86/sadd_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/sadd_sat_vec.ll
@@ -861,9 +861,6 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpgtd %xmm2, %xmm3, %xmm3
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm1, %ymm1
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vcmpltps %ymm3, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm0, %ymm1, %ymm0
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm1
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
@@ -1062,9 +1059,6 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vpcmpgtd %xmm4, %xmm5, %xmm5
; AVX1-NEXT: vpcmpgtd %xmm6, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vcvtdq2ps %ymm2, %ymm2
-; AVX1-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX1-NEXT: vcmpltps %ymm5, %ymm2, %ymm2
; AVX1-NEXT: vxorps %ymm0, %ymm2, %ymm0
; AVX1-NEXT: vpsrad $31, %xmm6, %xmm2
; AVX1-NEXT: vpsrad $31, %xmm4, %xmm4
@@ -1073,21 +1067,19 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvps %ymm0, %ymm2, %ymm7, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
-; AVX1-NEXT: vpaddd %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm7, %ymm8
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm6, %xmm6
-; AVX1-NEXT: vpcmpgtd %xmm7, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
-; AVX1-NEXT: vcvtdq2ps %ymm3, %ymm3
-; AVX1-NEXT: vcmpltps %ymm5, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm5
+; AVX1-NEXT: vpaddd %xmm2, %xmm5, %xmm2
+; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm6
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm6, %ymm7
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpgtd %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm1, %ymm1
; AVX1-NEXT: vxorps %ymm1, %ymm3, %ymm1
-; AVX1-NEXT: vpsrad $31, %xmm7, %xmm3
+; AVX1-NEXT: vpsrad $31, %xmm6, %xmm3
; AVX1-NEXT: vpsrad $31, %xmm2, %xmm2
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
; AVX1-NEXT: vxorps %ymm4, %ymm2, %ymm2
-; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm8, %ymm1
+; AVX1-NEXT: vblendvps %ymm1, %ymm2, %ymm7, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v16i32:
diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll
index f78b57d895ee18..d688e4e8c10ae5 100644
--- a/llvm/test/CodeGen/X86/sat-add.ll
+++ b/llvm/test/CodeGen/X86/sat-add.ll
@@ -644,20 +644,13 @@ define <2 x i64> @unsigned_sat_constant_v2i64_using_min(<2 x i64> %x) {
;
; SSE42-LABEL: unsigned_sat_constant_v2i64_using_min:
; SSE42: # %bb.0:
-; SSE42-NEXT: movdqa %xmm0, %xmm1
-; SSE42-NEXT: movdqa {{.*#+}} xmm0 = [9223372036854775808,9223372036854775808]
-; SSE42-NEXT: pxor %xmm1, %xmm0
-; SSE42-NEXT: pcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; SSE42-NEXT: paddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; SSE42-NEXT: retq
;
; AVX2-LABEL: unsigned_sat_constant_v2i64_using_min:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 3dde5c1c8a40c1..2204662ec2688a 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2215,42 +2215,36 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-LABEL: pr51133:
; CHECK-SSE2: # %bb.0:
; CHECK-SSE2-NEXT: movq %rdi, %rax
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm5
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm5
-; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm6
-; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm6
-; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT: movdqa %xmm6, %xmm5
-; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm5 = xmm5[8],xmm6[8],xmm5[9],xmm6[9],xmm5[10],xmm6[10],xmm5[11],xmm6[11],xmm5[12],xmm6[12],xmm5[13],xmm6[13],xmm5[14],xmm6[14],xmm5[15],xmm6[15]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm5
-; CHECK-SSE2-NEXT: psrlw $8, %xmm5
-; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE2-NEXT: psrlw $8, %xmm6
-; CHECK-SSE2-NEXT: packuswb %xmm5, %xmm6
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm7 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE2-NEXT: pminub %xmm6, %xmm7
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm7
-; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE2-NEXT: pandn %xmm5, %xmm7
-; CHECK-SSE2-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pxor %xmm6, %xmm6
-; CHECK-SSE2-NEXT: pcmpgtb %xmm6, %xmm1
-; CHECK-SSE2-NEXT: pandn %xmm1, %xmm5
-; CHECK-SSE2-NEXT: por %xmm7, %xmm5
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm4
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm1
+; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm1
+; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: movdqa %xmm1, %xmm4
+; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: psrlw $8, %xmm4
+; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
+; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE2-NEXT: psrlw $8, %xmm1
+; CHECK-SSE2-NEXT: packuswb %xmm4, %xmm1
+; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm4 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
+; CHECK-SSE2-NEXT: pminub %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pcmpeqb %xmm1, %xmm4
+; CHECK-SSE2-NEXT: pandn {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
+; CHECK-SSE2-NEXT: por {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
; CHECK-SSE2-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm1
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm1
; CHECK-SSE2-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-SSE2-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; CHECK-SSE2-NEXT: pand %xmm4, %xmm0
+; CHECK-SSE2-NEXT: pand %xmm5, %xmm0
; CHECK-SSE2-NEXT: packuswb %xmm1, %xmm0
; CHECK-SSE2-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE2-NEXT: movdqa %xmm0, %xmm1
@@ -2264,9 +2258,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE2-NEXT: movdqa {{.*#+}} xmm1 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE2-NEXT: pmaxub %xmm0, %xmm1
; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm3
-; CHECK-SSE2-NEXT: pandn %xmm5, %xmm3
-; CHECK-SSE2-NEXT: pcmpeqb %xmm6, %xmm2
+; CHECK-SSE2-NEXT: pxor %xmm0, %xmm0
+; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm3
+; CHECK-SSE2-NEXT: pandn %xmm4, %xmm3
+; CHECK-SSE2-NEXT: pcmpeqb %xmm0, %xmm2
; CHECK-SSE2-NEXT: pandn %xmm1, %xmm2
; CHECK-SSE2-NEXT: pmovmskb %xmm2, %ecx
; CHECK-SSE2-NEXT: pmovmskb %xmm3, %edx
@@ -2279,34 +2274,31 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41: # %bb.0:
; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm4
; CHECK-SSE41-NEXT: movq %rdi, %rax
-; CHECK-SSE41-NEXT: movdqa %xmm1, %xmm0
-; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm5 = [255,255,255,255,255,255,255,255]
+; CHECK-SSE41-NEXT: pand %xmm5, %xmm1
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: pand %xmm5, %xmm0
-; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm6 = xmm1[0],zero,xmm1[1],zero,xmm1[2],zero,xmm1[3],zero,xmm1[4],zero,xmm1[5],zero,xmm1[6],zero,xmm1[7],zero
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE41-NEXT: pand %xmm5, %xmm6
-; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
-; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE41-NEXT: movdqa %xmm6, %xmm0
-; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm0 = xmm0[8],xmm6[8],xmm0[9],xmm6[9],xmm0[10],xmm6[10],xmm0[11],xmm6[11],xmm0[12],xmm6[12],xmm0[13],xmm6[13],xmm0[14],xmm6[14],xmm0[15],xmm6[15]
+; CHECK-SSE41-NEXT: packuswb %xmm1, %xmm0
+; CHECK-SSE41-NEXT: paddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; CHECK-SSE41-NEXT: movdqa %xmm0, %xmm1
+; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm1 = xmm1[8],xmm0[8],xmm1[9],xmm0[9],xmm1[10],xmm0[10],xmm1[11],xmm0[11],xmm1[12],xmm0[12],xmm1[13],xmm0[13],xmm1[14],xmm0[14],xmm1[15],xmm0[15]
+; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; CHECK-SSE41-NEXT: psrlw $8, %xmm1
+; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm0 = xmm0[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
; CHECK-SSE41-NEXT: psrlw $8, %xmm0
-; CHECK-SSE41-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0,0,1,1,2,2,3,3,4,4,5,5,6,6,7,7]
-; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm6
-; CHECK-SSE41-NEXT: psrlw $8, %xmm6
-; CHECK-SSE41-NEXT: packuswb %xmm0, %xmm6
-; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm0 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
-; CHECK-SSE41-NEXT: pminub %xmm6, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm0
-; CHECK-SSE41-NEXT: pcmpeqd %xmm7, %xmm7
-; CHECK-SSE41-NEXT: pxor %xmm0, %xmm7
-; CHECK-SSE41-NEXT: pand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; CHECK-SSE41-NEXT: pxor %xmm6, %xmm6
-; CHECK-SSE41-NEXT: pcmpgtb %xmm6, %xmm1
+; CHECK-SSE41-NEXT: packuswb %xmm1, %xmm0
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [84,2,36,42,2,1,2,4,2,255,4,36,127,31,2,2]
+; CHECK-SSE41-NEXT: pminub %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm1
+; CHECK-SSE41-NEXT: pcmpeqd %xmm6, %xmm6
+; CHECK-SSE41-NEXT: pxor %xmm1, %xmm6
; CHECK-SSE41-NEXT: movaps {{.*#+}} xmm0 = [255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
-; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm7, %xmm1
+; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm1 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128]
+; CHECK-SSE41-NEXT: pblendvb %xmm0, %xmm6, %xmm1
; CHECK-SSE41-NEXT: pmovzxbw {{.*#+}} xmm0 = xmm4[0],zero,xmm4[1],zero,xmm4[2],zero,xmm4[3],zero,xmm4[4],zero,xmm4[5],zero,xmm4[6],zero,xmm4[7],zero
; CHECK-SSE41-NEXT: punpckhbw {{.*#+}} xmm4 = xmm4[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15]
; CHECK-SSE41-NEXT: pmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm4
@@ -2326,9 +2318,10 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
; CHECK-SSE41-NEXT: movdqa {{.*#+}} xmm4 = [19,51,13,7,128,32,128,3,5,5,51,37,3,128,85,5]
; CHECK-SSE41-NEXT: pmaxub %xmm0, %xmm4
; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm4
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm3
+; CHECK-SSE41-NEXT: pxor %xmm0, %xmm0
+; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm3
; CHECK-SSE41-NEXT: pandn %xmm1, %xmm3
-; CHECK-SSE41-NEXT: pcmpeqb %xmm6, %xmm2
+; CHECK-SSE41-NEXT: pcmpeqb %xmm0, %xmm2
; CHECK-SSE41-NEXT: pandn %xmm4, %xmm2
; CHECK-SSE41-NEXT: pmovmskb %xmm2, %ecx
; CHECK-SSE41-NEXT: pmovmskb %xmm3, %edx
diff --git a/llvm/test/CodeGen/X86/ssub_sat_vec.ll b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
index f3a19505c92dbc..7aae10fce6b393 100644
--- a/llvm/test/CodeGen/X86/ssub_sat_vec.ll
+++ b/llvm/test/CodeGen/X86/ssub_sat_vec.ll
@@ -644,54 +644,46 @@ define <2 x i32> @v2i32(<2 x i32> %x, <2 x i32> %y) nounwind {
;
; SSE41-LABEL: v2i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v2i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v2i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
@@ -745,54 +737,46 @@ define <4 x i32> @v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; SSE41-LABEL: v4i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: pxor %xmm3, %xmm3
; SSE41-NEXT: movdqa %xmm0, %xmm2
; SSE41-NEXT: psubd %xmm1, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm1
; SSE41-NEXT: pcmpgtd %xmm2, %xmm0
; SSE41-NEXT: pxor %xmm1, %xmm0
; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
; SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
-; SSE41-NEXT: movaps %xmm2, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movaps %xmm1, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v4i32:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vpsrad $31, %xmm1, %xmm2
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
+; AVX1-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vpsrad $31, %xmm2, %xmm1
+; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
; AVX1-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX2-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX2-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX2-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtd %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vpsrad $31, %xmm1, %xmm2
+; AVX512F-NEXT: vpsubd %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vpcmpgtd %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vpsrad $31, %xmm2, %xmm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} xmm3 = [2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %xmm3, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm3, %xmm1, %xmm1
; AVX512F-NEXT: vblendvps %xmm0, %xmm2, %xmm1, %xmm0
; AVX512F-NEXT: retq
;
@@ -870,29 +854,26 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
;
; SSE41-LABEL: v8i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm1, %xmm5
-; SSE41-NEXT: pxor %xmm6, %xmm6
-; SSE41-NEXT: movdqa %xmm0, %xmm4
-; SSE41-NEXT: psubd %xmm2, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm4, %xmm0
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psubd %xmm2, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm2, %xmm0
-; SSE41-NEXT: movdqa %xmm4, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: psrad $31, %xmm2
+; SSE41-NEXT: movdqa {{.*#+}} xmm5 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm5, %xmm2
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm2
+; SSE41-NEXT: movdqa %xmm4, %xmm6
+; SSE41-NEXT: psubd %xmm3, %xmm6
+; SSE41-NEXT: pcmpgtd %xmm6, %xmm4
+; SSE41-NEXT: pxor %xmm3, %xmm4
+; SSE41-NEXT: movdqa %xmm6, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm2 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm2, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4
-; SSE41-NEXT: movdqa %xmm5, %xmm1
-; SSE41-NEXT: psubd %xmm3, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm6, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm5
-; SSE41-NEXT: pxor %xmm3, %xmm5
-; SSE41-NEXT: movdqa %xmm1, %xmm3
-; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pxor %xmm2, %xmm3
-; SSE41-NEXT: movdqa %xmm5, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm1
-; SSE41-NEXT: movaps %xmm4, %xmm0
+; SSE41-NEXT: pxor %xmm5, %xmm1
+; SSE41-NEXT: movdqa %xmm4, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm6, %xmm1
+; SSE41-NEXT: movaps %xmm2, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v8i32:
@@ -918,27 +899,23 @@ define <8 x i32> @v8i32(<8 x i32> %x, <8 x i32> %y) nounwind {
;
; AVX2-LABEL: v8i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX2-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm1
; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v8i32:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpcmpgtd %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsrad $31, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubd %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpsrad $31, %ymm2, %ymm1
; AVX512F-NEXT: vpbroadcastd {{.*#+}} ymm3 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vblendvps %ymm0, %ymm2, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -1060,51 +1037,46 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
;
; SSE41-LABEL: v16i32:
; SSE41: # %bb.0:
-; SSE41-NEXT: movdqa %xmm3, %xmm11
-; SSE41-NEXT: movdqa %xmm2, %xmm10
-; SSE41-NEXT: movdqa %xmm1, %xmm9
-; SSE41-NEXT: pxor %xmm12, %xmm12
-; SSE41-NEXT: movdqa %xmm0, %xmm8
-; SSE41-NEXT: psubd %xmm4, %xmm8
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm4
-; SSE41-NEXT: pcmpgtd %xmm8, %xmm0
+; SSE41-NEXT: movdqa %xmm3, %xmm10
+; SSE41-NEXT: movdqa %xmm2, %xmm9
+; SSE41-NEXT: movdqa %xmm1, %xmm8
+; SSE41-NEXT: movdqa %xmm0, %xmm1
+; SSE41-NEXT: psubd %xmm4, %xmm1
+; SSE41-NEXT: pcmpgtd %xmm1, %xmm0
; SSE41-NEXT: pxor %xmm4, %xmm0
-; SSE41-NEXT: movdqa %xmm8, %xmm1
+; SSE41-NEXT: movdqa %xmm1, %xmm4
+; SSE41-NEXT: psrad $31, %xmm4
+; SSE41-NEXT: movdqa {{.*#+}} xmm11 = [2147483648,2147483648,2147483648,2147483648]
+; SSE41-NEXT: pxor %xmm11, %xmm4
+; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm4
+; SSE41-NEXT: movdqa %xmm8, %xmm2
+; SSE41-NEXT: psubd %xmm5, %xmm2
+; SSE41-NEXT: pcmpgtd %xmm2, %xmm8
+; SSE41-NEXT: pxor %xmm5, %xmm8
+; SSE41-NEXT: movdqa %xmm2, %xmm1
; SSE41-NEXT: psrad $31, %xmm1
-; SSE41-NEXT: movdqa {{.*#+}} xmm4 = [2147483648,2147483648,2147483648,2147483648]
-; SSE41-NEXT: pxor %xmm4, %xmm1
-; SSE41-NEXT: blendvps %xmm0, %xmm1, %xmm8
-; SSE41-NEXT: movdqa %xmm9, %xmm1
-; SSE41-NEXT: psubd %xmm5, %xmm1
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm5
-; SSE41-NEXT: pcmpgtd %xmm1, %xmm9
-; SSE41-NEXT: pxor %xmm5, %xmm9
-; SSE41-NEXT: movdqa %xmm1, %xmm2
+; SSE41-NEXT: pxor %xmm11, %xmm1
+; SSE41-NEXT: movdqa %xmm8, %xmm0
+; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
+; SSE41-NEXT: movdqa %xmm9, %xmm3
+; SSE41-NEXT: psubd %xmm6, %xmm3
+; SSE41-NEXT: pcmpgtd %xmm3, %xmm9
+; SSE41-NEXT: pxor %xmm6, %xmm9
+; SSE41-NEXT: movdqa %xmm3, %xmm2
; SSE41-NEXT: psrad $31, %xmm2
-; SSE41-NEXT: pxor %xmm4, %xmm2
+; SSE41-NEXT: pxor %xmm11, %xmm2
; SSE41-NEXT: movdqa %xmm9, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm2, %xmm1
-; SSE41-NEXT: movdqa %xmm10, %xmm2
-; SSE41-NEXT: psubd %xmm6, %xmm2
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm6
-; SSE41-NEXT: pcmpgtd %xmm2, %xmm10
-; SSE41-NEXT: pxor %xmm6, %xmm10
-; SSE41-NEXT: movdqa %xmm2, %xmm3
+; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
+; SSE41-NEXT: movdqa %xmm10, %xmm5
+; SSE41-NEXT: psubd %xmm7, %xmm5
+; SSE41-NEXT: pcmpgtd %xmm5, %xmm10
+; SSE41-NEXT: pxor %xmm7, %xmm10
+; SSE41-NEXT: movdqa %xmm5, %xmm3
; SSE41-NEXT: psrad $31, %xmm3
-; SSE41-NEXT: pxor %xmm4, %xmm3
+; SSE41-NEXT: pxor %xmm11, %xmm3
; SSE41-NEXT: movdqa %xmm10, %xmm0
-; SSE41-NEXT: blendvps %xmm0, %xmm3, %xmm2
-; SSE41-NEXT: movdqa %xmm11, %xmm3
-; SSE41-NEXT: psubd %xmm7, %xmm3
-; SSE41-NEXT: pcmpgtd %xmm12, %xmm7
-; SSE41-NEXT: pcmpgtd %xmm3, %xmm11
-; SSE41-NEXT: pxor %xmm7, %xmm11
-; SSE41-NEXT: movdqa %xmm3, %xmm5
-; SSE41-NEXT: psrad $31, %xmm5
-; SSE41-NEXT: pxor %xmm4, %xmm5
-; SSE41-NEXT: movdqa %xmm11, %xmm0
; SSE41-NEXT: blendvps %xmm0, %xmm5, %xmm3
-; SSE41-NEXT: movaps %xmm8, %xmm0
+; SSE41-NEXT: movaps %xmm4, %xmm0
; SSE41-NEXT: retq
;
; AVX1-LABEL: v16i32:
@@ -1147,21 +1119,18 @@ define <16 x i32> @v16i32(<16 x i32> %x, <16 x i32> %y) nounwind {
;
; AVX2-LABEL: v16i32:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtd %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpsrad $31, %ymm2, %ymm5
-; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm6 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
-; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vblendvps %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtd %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrad $31, %ymm3, %ymm2
-; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
+; AVX2-NEXT: vpsubd %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpcmpgtd %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpsrad $31, %ymm4, %ymm2
+; AVX2-NEXT: vpbroadcastd {{.*#+}} ymm5 = [2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648,2147483648]
+; AVX2-NEXT: vpxor %ymm5, %ymm2, %ymm2
+; AVX2-NEXT: vblendvps %ymm0, %ymm4, %ymm2, %ymm0
+; AVX2-NEXT: vpsubd %ymm3, %ymm1, %ymm2
+; AVX2-NEXT: vpcmpgtd %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrad $31, %ymm2, %ymm3
+; AVX2-NEXT: vpxor %ymm5, %ymm3, %ymm3
; AVX2-NEXT: vblendvps %ymm1, %ymm2, %ymm3, %ymm1
; AVX2-NEXT: retq
;
@@ -1284,41 +1253,35 @@ define <2 x i64> @v2i64(<2 x i64> %x, <2 x i64> %y) nounwind {
;
; AVX1-LABEL: v2i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX1-NEXT: # xmm2 = mem[0,0]
-; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX1-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX1-NEXT: # xmm3 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v2i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX2-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX2-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX2-NEXT: # xmm2 = mem[0,0]
-; AVX2-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX2-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX2-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX2-NEXT: # xmm3 = mem[0,0]
+; AVX2-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX2-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v2i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm2
-; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm1
-; AVX512F-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX512F-NEXT: vpxor %xmm0, %xmm2, %xmm0
-; AVX512F-NEXT: vmovddup {{.*#+}} xmm2 = [9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: # xmm2 = mem[0,0]
-; AVX512F-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2, %xmm2
-; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm1, %xmm0
+; AVX512F-NEXT: vpsubq %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: vmovddup {{.*#+}} xmm3 = [9223372036854775808,9223372036854775808]
+; AVX512F-NEXT: # xmm3 = mem[0,0]
+; AVX512F-NEXT: vblendvpd %xmm2, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3
+; AVX512F-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
+; AVX512F-NEXT: vpxor %xmm0, %xmm1, %xmm0
+; AVX512F-NEXT: vblendvpd %xmm0, %xmm2, %xmm3, %xmm0
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: v2i64:
@@ -1527,48 +1490,46 @@ define <4 x i64> @v4i64(<4 x i64> %x, <4 x i64> %y) nounwind {
; AVX1-LABEL: v4i64:
; AVX1: # %bb.0:
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
-; AVX1-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm5
-; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm5, %ymm4
-; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm5
-; AVX1-NEXT: vpsubq %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm5, %xmm5
+; AVX1-NEXT: vpcmpeqd %xmm3, %xmm3, %xmm3
+; AVX1-NEXT: vpxor %xmm3, %xmm2, %xmm4
+; AVX1-NEXT: vpxor %xmm3, %xmm1, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm3, %ymm3
+; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm4
+; AVX1-NEXT: vpsubq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm4
; AVX1-NEXT: vpsubq %xmm1, %xmm0, %xmm1
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm0, %ymm0
-; AVX1-NEXT: vxorpd %ymm0, %ymm4, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm4
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm3, %xmm1
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm0, %ymm0
+; AVX1-NEXT: vxorpd %ymm0, %ymm3, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm3
+; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm1, %ymm1
; AVX1-NEXT: vxorpd {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
-; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm4, %ymm0
+; AVX1-NEXT: vblendvpd %ymm0, %ymm1, %ymm3, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: v4i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm3, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm2
+; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX2-NEXT: retq
;
; AVX512F-LABEL: v4i64:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm2
-; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX512F-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm0
-; AVX512F-NEXT: vpxor %ymm0, %ymm2, %ymm0
-; AVX512F-NEXT: vpsraq $63, %zmm1, %zmm2
+; AVX512F-NEXT: vpsubq %ymm1, %ymm0, %ymm2
+; AVX512F-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpxor %ymm0, %ymm1, %ymm0
+; AVX512F-NEXT: vpsraq $63, %zmm2, %zmm1
; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX512F-NEXT: vpxor %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpxor %ymm3, %ymm1, %ymm1
; AVX512F-NEXT: vblendvpd %ymm0, %ymm2, %ymm1, %ymm0
; AVX512F-NEXT: retq
;
@@ -1943,62 +1904,61 @@ define <8 x i64> @v8i64(<8 x i64> %x, <8 x i64> %y) nounwind {
;
; AVX1-LABEL: v8i64:
; AVX1: # %bb.0:
-; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm5
-; AVX1-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm5, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm7
+; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm4
+; AVX1-NEXT: vpcmpeqd %xmm5, %xmm5, %xmm5
+; AVX1-NEXT: vpxor %xmm5, %xmm4, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm7
; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm7
-; AVX1-NEXT: vpsubq %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm7, %xmm7
+; AVX1-NEXT: vpsubq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm7
; AVX1-NEXT: vpsubq %xmm2, %xmm0, %xmm2
; AVX1-NEXT: vpcmpgtq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm0, %ymm0
; AVX1-NEXT: vxorpd %ymm0, %ymm6, %ymm0
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm6
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm4, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm5, %ymm2, %ymm2
-; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm5 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm6
+; AVX1-NEXT: vpxor %xmm7, %xmm7, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm7, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
+; AVX1-NEXT: vbroadcastsd {{.*#+}} ymm4 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
+; AVX1-NEXT: vxorpd %ymm4, %ymm2, %ymm2
; AVX1-NEXT: vblendvpd %ymm0, %ymm2, %ymm6, %ymm0
; AVX1-NEXT: vextractf128 $1, %ymm3, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm6
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm7
-; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm7, %ymm6
-; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
-; AVX1-NEXT: vpsubq %xmm2, %xmm7, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm7
+; AVX1-NEXT: vpxor %xmm5, %xmm2, %xmm6
+; AVX1-NEXT: vpxor %xmm5, %xmm3, %xmm5
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm5, %ymm5
+; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
+; AVX1-NEXT: vpsubq %xmm2, %xmm6, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm6, %xmm6
; AVX1-NEXT: vpsubq %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vinsertf128 $1, %xmm7, %ymm1, %ymm1
-; AVX1-NEXT: vxorpd %ymm1, %ymm6, %ymm1
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm6
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vinsertf128 $1, %xmm6, %ymm1, %ymm1
+; AVX1-NEXT: vxorpd %ymm1, %ymm5, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm5
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm7, %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm7, %xmm3
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
-; AVX1-NEXT: vxorpd %ymm5, %ymm2, %ymm2
-; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm6, %ymm1
+; AVX1-NEXT: vxorpd %ymm4, %ymm2, %ymm2
+; AVX1-NEXT: vblendvpd %ymm1, %ymm2, %ymm5, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: v8i64:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpxor %xmm4, %xmm4, %xmm4
+; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm0
+; AVX2-NEXT: vpxor %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpxor %xmm2, %xmm2, %xmm2
; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm5
-; AVX2-NEXT: vpsubq %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpxor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm4, %ymm5
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vblendvpd %ymm0, %ymm5, %ymm2, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm2
-; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpxor %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm4, %ymm2
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm5, %ymm0
+; AVX2-NEXT: vpsubq %ymm3, %ymm1, %ymm4
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpxor %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpxor %ymm6, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm2, %ymm1
; AVX2-NEXT: retq
;
; AVX512-LABEL: v8i64:
diff --git a/llvm/test/CodeGen/X86/var-permute-256.ll b/llvm/test/CodeGen/X86/var-permute-256.ll
index 56dc2f0571b170..487c2d8c460b7a 100644
--- a/llvm/test/CodeGen/X86/var-permute-256.ll
+++ b/llvm/test/CodeGen/X86/var-permute-256.ll
@@ -1171,8 +1171,8 @@ define <4 x i64> @PR50356(<4 x i64> %0, <4 x i32> %1, <4 x i64> %2) unnamed_addr
; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vextractf128 $1, %ymm2, %xmm1
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vpxor %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX1-NEXT: vmovapd {{.*#+}} ymm1 = [34,68,102,136]
; AVX1-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm0
diff --git a/llvm/test/CodeGen/X86/vector-pcmp.ll b/llvm/test/CodeGen/X86/vector-pcmp.ll
index fbd7df1c839e97..1f2abc50731786 100644
--- a/llvm/test/CodeGen/X86/vector-pcmp.ll
+++ b/llvm/test/CodeGen/X86/vector-pcmp.ll
@@ -1957,14 +1957,14 @@ define <4 x i64> @PR52504(<4 x i16> %t3) {
; SSE42-LABEL: PR52504:
; SSE42: # %bb.0:
; SSE42-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,1,1,1]
-; SSE42-NEXT: pmovsxwq %xmm1, %xmm2
-; SSE42-NEXT: pmovsxwq %xmm0, %xmm3
-; SSE42-NEXT: pxor %xmm1, %xmm1
-; SSE42-NEXT: pxor %xmm0, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm3, %xmm0
-; SSE42-NEXT: por %xmm3, %xmm0
-; SSE42-NEXT: pcmpgtq %xmm2, %xmm1
-; SSE42-NEXT: por %xmm2, %xmm1
+; SSE42-NEXT: pmovsxwq %xmm1, %xmm1
+; SSE42-NEXT: pmovsxwq %xmm0, %xmm2
+; SSE42-NEXT: pcmpeqd %xmm3, %xmm3
+; SSE42-NEXT: movdqa %xmm2, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm2
+; SSE42-NEXT: movdqa %xmm1, %xmm0
+; SSE42-NEXT: blendvpd %xmm0, %xmm3, %xmm1
+; SSE42-NEXT: movapd %xmm2, %xmm0
; SSE42-NEXT: retq
;
; AVX1-LABEL: PR52504:
@@ -1972,20 +1972,17 @@ define <4 x i64> @PR52504(<4 x i16> %t3) {
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm1
; AVX1-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,1,1,1]
; AVX1-NEXT: vpmovsxwq %xmm0, %xmm0
-; AVX1-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm3
-; AVX1-NEXT: vpor %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm2
-; AVX1-NEXT: vpor %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm0, %xmm2, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm1, %xmm2, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: PR52504:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxwq %xmm0, %ymm0
-; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm1
-; AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpcmpeqd %ymm1, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: PR52504:
diff --git a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
index d276a6873012a9..1196393159ce5e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-ssat.ll
@@ -85,10 +85,7 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: retq
;
@@ -97,9 +94,7 @@ define <2 x i32> @trunc_ssat_v2i64_v2i32(<2 x i64> %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: retq
;
@@ -209,10 +204,7 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; AVX1-NEXT: # xmm1 = mem[0,0]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vmovlpd %xmm0, (%rdi)
; AVX1-NEXT: retq
@@ -222,9 +214,7 @@ define void @trunc_ssat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [2147483647,2147483647]
; AVX2-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
-; AVX2-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-NEXT: vmovlpd %xmm0, (%rdi)
; AVX2-NEXT: retq
@@ -389,10 +379,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744071562067968,18446744071562067968]
; AVX1-NEXT: # xmm1 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm1[0,2],xmm0[0,2]
; AVX1-NEXT: vzeroupper
; AVX1-NEXT: retq
@@ -402,9 +390,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-SLOW-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-SLOW-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-SLOW-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
@@ -415,9 +402,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-ALL-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -430,9 +416,8 @@ define <4 x i32> @trunc_ssat_v4i64_v4i32(<4 x i64> %a0) {
; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [2147483647,2147483647,2147483647,2147483647]
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -710,14 +695,10 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744071562067968,18446744071562067968]
; AVX1-NEXT: # xmm4 = mem[0,0]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm4, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm0, %xmm4, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm1, %xmm1
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm1, %ymm1
; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm1[0,2],ymm0[4,6],ymm1[4,6]
@@ -732,11 +713,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-SLOW-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-SLOW-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-SLOW-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-SLOW-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-SLOW-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
@@ -751,11 +730,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-ALL-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-FAST-ALL-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vmovapd {{.*#+}} ymm2 = [0,2,4,6,4,6,6,7]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm2, %ymm0
; AVX2-FAST-ALL-NEXT: vpermps %ymm1, %ymm2, %ymm1
@@ -771,11 +748,9 @@ define <8 x i32> @trunc_ssat_v8i64_v8i32(ptr %p0) "min-legal-vector-width"="256"
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744071562067968,18446744071562067968,18446744071562067968,18446744071562067968]
+; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vperm2f128 {{.*#+}} ymm2 = ymm0[2,3],ymm1[2,3]
; AVX2-FAST-PERLANE-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} ymm0 = ymm0[0,2],ymm2[0,2],ymm0[4,6],ymm2[4,6]
@@ -875,9 +850,7 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
@@ -887,9 +860,7 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: retq
@@ -899,9 +870,7 @@ define <2 x i16> @trunc_ssat_v2i64_v2i16(<2 x i64> %a0) {
; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: retq
;
@@ -1010,9 +979,7 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
@@ -1023,9 +990,7 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX2-SLOW-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-SLOW-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-SLOW-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-SLOW-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-SLOW-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
@@ -1036,9 +1001,7 @@ define void @trunc_ssat_v2i64_v2i16_store(<2 x i64> %a0, ptr%p1) {
; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [32767,32767]
; AVX2-FAST-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX2-FAST-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX2-FAST-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX2-FAST-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX2-FAST-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
; AVX2-FAST-NEXT: retq
@@ -1202,11 +1165,10 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vzeroupper
@@ -1217,9 +1179,8 @@ define <4 x i16> @trunc_ssat_v4i64_v4i16(<4 x i64> %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -1386,11 +1347,10 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vmovq %xmm0, (%rdi)
@@ -1402,9 +1362,8 @@ define void @trunc_ssat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [32767,32767,32767,32767]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -1683,16 +1642,13 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmovsxwq {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709518848,18446744073709518848]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: retq
@@ -1706,11 +1662,9 @@ define <8 x i16> @trunc_ssat_v8i64_v8i16(ptr %p0) "min-legal-vector-width"="256"
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709518848,18446744073709518848,18446744073709518848,18446744073709518848]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -2028,9 +1982,7 @@ define <2 x i8> @trunc_ssat_v2i64_v2i8(<2 x i64> %a0) {
; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
;
@@ -2174,9 +2126,7 @@ define void @trunc_ssat_v2i64_v2i8_store(<2 x i64> %a0, ptr%p1) {
; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127]
; AVX-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm2
-; AVX-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -2342,11 +2292,10 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -2358,9 +2307,8 @@ define <4 x i8> @trunc_ssat_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -2530,11 +2478,10 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm3
; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm0, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm0, %xmm1, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm1 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm1 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm0, %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -2547,9 +2494,8 @@ define void @trunc_ssat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [127,127,127,127]
; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackssdw %xmm0, %xmm0, %xmm0
@@ -2831,16 +2777,13 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -2855,11 +2798,9 @@ define <8 x i8> @trunc_ssat_v8i64_v8i8(ptr %p0) "min-legal-vector-width"="256" {
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -3127,16 +3068,13 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
; AVX1-NEXT: vpcmpgtq %xmm1, %xmm4, %xmm5
; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm1, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm0, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm0, %xmm4, %xmm0
+; AVX1-NEXT: vmovddup {{.*#+}} xmm4 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm4 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm1, %xmm4, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm0, %xmm4, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm3, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm3, %xmm4, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm4, %xmm2
+; AVX1-NEXT: vblendvpd %xmm3, %xmm4, %xmm3, %xmm1
+; AVX1-NEXT: vblendvpd %xmm2, %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpackssdw %xmm1, %xmm2, %xmm1
; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vpacksswb %xmm0, %xmm0, %xmm0
@@ -3152,11 +3090,9 @@ define void @trunc_ssat_v8i64_v8i8_store(ptr %p0, ptr%p1) "min-legal-vector-widt
; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm2, %ymm3
; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm2, %ymm0, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm2 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm2, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextracti128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
@@ -3610,54 +3546,47 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX1-LABEL: trunc_ssat_v16i64_v16i8:
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm0
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm2 = [127,127]
-; AVX1-NEXT: vpcmpgtq %xmm0, %xmm2, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, %xmm0, %xmm2, %xmm0
-; AVX1-NEXT: vmovdqa 112(%rdi), %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm1, %xmm2, %xmm3
-; AVX1-NEXT: vblendvpd %xmm3, %xmm1, %xmm2, %xmm1
+; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm1 = [127,127]
+; AVX1-NEXT: vpcmpgtq %xmm0, %xmm1, %xmm2
+; AVX1-NEXT: vblendvpd %xmm2, %xmm0, %xmm1, %xmm0
+; AVX1-NEXT: vmovdqa 112(%rdi), %xmm2
+; AVX1-NEXT: vpcmpgtq %xmm2, %xmm1, %xmm3
+; AVX1-NEXT: vblendvpd %xmm3, %xmm2, %xmm1, %xmm2
; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm3, %xmm2, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm2, %xmm3
+; AVX1-NEXT: vpcmpgtq %xmm3, %xmm1, %xmm4
+; AVX1-NEXT: vblendvpd %xmm4, %xmm3, %xmm1, %xmm3
; AVX1-NEXT: vmovdqa 80(%rdi), %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm4, %xmm2, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm2, %xmm4
+; AVX1-NEXT: vpcmpgtq %xmm4, %xmm1, %xmm5
+; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm1, %xmm4
; AVX1-NEXT: vmovdqa (%rdi), %xmm5
; AVX1-NEXT: vmovdqa 16(%rdi), %xmm6
; AVX1-NEXT: vmovdqa 32(%rdi), %xmm7
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm7, %xmm2, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm2, %xmm7
-; AVX1-NEXT: vpcmpgtq %xmm8, %xmm2, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm2, %xmm8
-; AVX1-NEXT: vpcmpgtq %xmm5, %xmm2, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm2, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxbq {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm2, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm2, %xmm6, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm5, %xmm9
-; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm6, %xmm5
-; AVX1-NEXT: vpackssdw %xmm2, %xmm5, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm8, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm8, %xmm6, %xmm5
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm7, %xmm8
-; AVX1-NEXT: vblendvpd %xmm8, %xmm7, %xmm6, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm7, %xmm1, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm7, %xmm1, %xmm7
+; AVX1-NEXT: vpcmpgtq %xmm8, %xmm1, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm8, %xmm1, %xmm8
+; AVX1-NEXT: vpcmpgtq %xmm5, %xmm1, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm5, %xmm1, %xmm5
+; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm9
+; AVX1-NEXT: vblendvpd %xmm9, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vmovddup {{.*#+}} xmm6 = [18446744073709551488,18446744073709551488]
+; AVX1-NEXT: # xmm6 = mem[0,0]
+; AVX1-NEXT: vblendvpd %xmm1, %xmm6, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm5, %xmm6, %xmm5, %xmm5
+; AVX1-NEXT: vpackssdw %xmm1, %xmm5, %xmm1
+; AVX1-NEXT: vblendvpd %xmm8, %xmm6, %xmm8, %xmm5
+; AVX1-NEXT: vblendvpd %xmm7, %xmm6, %xmm7, %xmm7
; AVX1-NEXT: vpackssdw %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpackssdw %xmm5, %xmm2, %xmm2
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm4, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm3, %xmm5
-; AVX1-NEXT: vblendvpd %xmm5, %xmm3, %xmm6, %xmm3
+; AVX1-NEXT: vpackssdw %xmm5, %xmm1, %xmm1
+; AVX1-NEXT: vblendvpd %xmm4, %xmm6, %xmm4, %xmm4
+; AVX1-NEXT: vblendvpd %xmm3, %xmm6, %xmm3, %xmm3
; AVX1-NEXT: vpackssdw %xmm4, %xmm3, %xmm3
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm1, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm1, %xmm6, %xmm1
-; AVX1-NEXT: vpcmpgtq %xmm6, %xmm0, %xmm4
-; AVX1-NEXT: vblendvpd %xmm4, %xmm0, %xmm6, %xmm0
-; AVX1-NEXT: vpackssdw %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm2, %xmm6, %xmm2, %xmm2
+; AVX1-NEXT: vblendvpd %xmm0, %xmm6, %xmm0, %xmm0
+; AVX1-NEXT: vpackssdw %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpackssdw %xmm0, %xmm3, %xmm0
-; AVX1-NEXT: vpacksswb %xmm0, %xmm2, %xmm0
+; AVX1-NEXT: vpacksswb %xmm0, %xmm1, %xmm0
; AVX1-NEXT: retq
;
; AVX2-LABEL: trunc_ssat_v16i64_v16i8:
@@ -3675,16 +3604,12 @@ define <16 x i8> @trunc_ssat_v16i64_v16i8(ptr %p0) "min-legal-vector-width"="256
; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpcmpgtq %ymm1, %ymm4, %ymm5
; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm1, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm1, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm0, %ymm5
-; AVX2-NEXT: vblendvpd %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm4 = [18446744073709551488,18446744073709551488,18446744073709551488,18446744073709551488]
+; AVX2-NEXT: vblendvpd %ymm1, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vblendvpd %ymm0, %ymm4, %ymm0, %ymm0
; AVX2-NEXT: vpackssdw %ymm1, %ymm0, %ymm0
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm3, %ymm1
-; AVX2-NEXT: vblendvpd %ymm1, %ymm3, %ymm4, %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm4, %ymm2, %ymm3
-; AVX2-NEXT: vblendvpd %ymm3, %ymm2, %ymm4, %ymm2
+; AVX2-NEXT: vblendvpd %ymm3, %ymm4, %ymm3, %ymm1
+; AVX2-NEXT: vblendvpd %ymm2, %ymm4, %ymm2, %ymm2
; AVX2-NEXT: vpackssdw %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,2,1,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,2,1,3]
diff --git a/llvm/test/CodeGen/X86/vector-trunc-usat.ll b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
index 4126616937473e..c7df66bb8ab20e 100644
--- a/llvm/test/CodeGen/X86/vector-trunc-usat.ll
+++ b/llvm/test/CodeGen/X86/vector-trunc-usat.ll
@@ -52,9 +52,7 @@ define <2 x i32> @trunc_usat_v2i64_v2i32(<2 x i64> %a0) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i32:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: retq
;
@@ -129,9 +127,7 @@ define void @trunc_usat_v2i64_v2i32_store(<2 x i64> %a0, ptr %p1) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i32_store:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX-NEXT: vmovlpd %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -252,11 +248,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX2-SLOW-LABEL: trunc_usat_v4i64_v4i32:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-SLOW-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-SLOW-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-SLOW-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
-; AVX2-SLOW-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-SLOW-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-SLOW-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-SLOW-NEXT: vzeroupper
@@ -264,11 +256,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX2-FAST-ALL-LABEL: trunc_usat_v4i64_v4i32:
; AVX2-FAST-ALL: # %bb.0:
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-ALL-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-ALL-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-FAST-ALL-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
-; AVX2-FAST-ALL-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-ALL-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-ALL-NEXT: vbroadcastf128 {{.*#+}} ymm1 = [0,2,4,6,0,2,4,6]
; AVX2-FAST-ALL-NEXT: # ymm1 = mem[0,1,0,1]
; AVX2-FAST-ALL-NEXT: vpermps %ymm0, %ymm1, %ymm0
@@ -278,11 +266,7 @@ define <4 x i32> @trunc_usat_v4i64_v4i32(<4 x i64> %a0) {
;
; AVX2-FAST-PERLANE-LABEL: trunc_usat_v4i64_v4i32:
; AVX2-FAST-PERLANE: # %bb.0:
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm1 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-FAST-PERLANE-NEXT: vpxor %ymm1, %ymm0, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372041149743102,9223372041149743102,9223372041149743102,9223372041149743102]
-; AVX2-FAST-PERLANE-NEXT: vpcmpgtq %ymm2, %ymm1, %ymm1
-; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; AVX2-FAST-PERLANE-NEXT: vblendvpd %ymm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
; AVX2-FAST-PERLANE-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-FAST-PERLANE-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
; AVX2-FAST-PERLANE-NEXT: vzeroupper
@@ -592,27 +576,21 @@ define <2 x i16> @trunc_usat_v2i64_v2i16(<2 x i64> %a0) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: retq
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-SLOW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: retq
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-FAST-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,8,9,10,11,8,9,10,11,12,13,14,15]
; AVX2-FAST-NEXT: retq
;
@@ -687,9 +665,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX1-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX1: # %bb.0:
-; AVX1-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX1-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX1-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX1-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX1-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX1-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX1-NEXT: vmovd %xmm0, (%rdi)
@@ -697,9 +673,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-SLOW-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-SLOW: # %bb.0:
-; AVX2-SLOW-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-SLOW-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-SLOW-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,2,3]
; AVX2-SLOW-NEXT: vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
; AVX2-SLOW-NEXT: vmovd %xmm0, (%rdi)
@@ -707,9 +681,7 @@ define void @trunc_usat_v2i64_v2i16_store(<2 x i64> %a0, ptr %p1) {
;
; AVX2-FAST-LABEL: trunc_usat_v2i64_v2i16_store:
; AVX2-FAST: # %bb.0:
-; AVX2-FAST-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX2-FAST-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX2-FAST-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX2-FAST-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX2-FAST-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,8,9,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX2-FAST-NEXT: vmovd %xmm0, (%rdi)
; AVX2-FAST-NEXT: retq
@@ -837,11 +809,7 @@ define <4 x i16> @trunc_usat_v4i64_v4i16(<4 x i64> %a0) {
; AVX2-LABEL: trunc_usat_v4i64_v4i16:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -972,11 +940,7 @@ define void @trunc_usat_v4i64_v4i16_store(<4 x i64> %a0, ptr%p1) {
; AVX2-LABEL: trunc_usat_v4i64_v4i16_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [65535,65535,65535,65535]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854841342,9223372036854841342,9223372036854841342,9223372036854841342]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -1693,9 +1657,7 @@ define <2 x i8> @trunc_usat_v2i64_v2i8(<2 x i64> %a0) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i8:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: retq
;
@@ -1788,9 +1750,7 @@ define void @trunc_usat_v2i64_v2i8_store(<2 x i64> %a0, ptr %p1) {
;
; AVX-LABEL: trunc_usat_v2i64_v2i8_store:
; AVX: # %bb.0:
-; AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; AVX-NEXT: vpcmpgtq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
-; AVX-NEXT: vblendvpd %xmm1, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
+; AVX-NEXT: vblendvpd %xmm0, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpextrw $0, %xmm0, (%rdi)
; AVX-NEXT: retq
@@ -1918,11 +1878,7 @@ define <4 x i8> @trunc_usat_v4i64_v4i8(<4 x i64> %a0) {
; AVX2-LABEL: trunc_usat_v4i64_v4i8:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
@@ -2055,11 +2011,7 @@ define void @trunc_usat_v4i64_v4i8_store(<4 x i64> %a0, ptr%p1) {
; AVX2-LABEL: trunc_usat_v4i64_v4i8_store:
; AVX2: # %bb.0:
; AVX2-NEXT: vbroadcastsd {{.*#+}} ymm1 = [255,255,255,255]
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [9223372036854776062,9223372036854776062,9223372036854776062,9223372036854776062]
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vblendvpd %ymm0, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vextractf128 $1, %ymm0, %xmm1
; AVX2-NEXT: vpackusdw %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vpackusdw %xmm0, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/vsel-cmp-load.ll b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
index bec0b60d95e826..097e645afa894c 100644
--- a/llvm/test/CodeGen/X86/vsel-cmp-load.ll
+++ b/llvm/test/CodeGen/X86/vsel-cmp-load.ll
@@ -55,9 +55,7 @@ define <4 x i64> @ne_zero(ptr %p, <4 x i64> %x, <4 x i64> %y) {
;
; AVX2-LABEL: ne_zero:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxwq {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -186,9 +184,7 @@ define <8 x float> @ne_zero_fp_select(ptr %p, <8 x float> %x, <8 x float> %y) {
;
; AVX2-LABEL: ne_zero_fp_select:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpmovzxbd {{.*#+}} ymm2 = mem[0],zero,zero,zero,mem[1],zero,zero,zero,mem[2],zero,zero,zero,mem[3],zero,zero,zero,mem[4],zero,zero,zero,mem[5],zero,zero,zero,mem[6],zero,zero,zero,mem[7],zero,zero,zero
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtd %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
; AVX2-NEXT: vblendvps %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
;
@@ -220,9 +216,7 @@ define <4 x double> @sgt_zero_fp_select(ptr %p, <4 x double> %x, <4 x double> %y
; AVX2-LABEL: sgt_zero_fp_select:
; AVX2: # %bb.0:
; AVX2-NEXT: vpmovsxbq (%rdi), %ymm2
-; AVX2-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vblendvpd %ymm2, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vblendvpd %ymm2, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
;
; AVX512-LABEL: sgt_zero_fp_select: