[llvm] CodeGen/X86: add missing mcpu to tests (PR #111865)
Ramkumar Ramachandra via llvm-commits
llvm-commits at lists.llvm.org
Thu Oct 10 11:05:39 PDT 2024
https://github.com/artagnon updated https://github.com/llvm/llvm-project/pull/111865
From 02540526ef783523c8aeafcb1cee163026766651 Mon Sep 17 00:00:00 2001
From: Ramkumar Ramachandra <ramkumar.ramachandra at codasip.com>
Date: Thu, 10 Oct 2024 14:45:49 +0100
Subject: [PATCH] CodeGen/X86: add missing mcpu to tests
In order to facilitate working on core CodeGen infrastructure that
improves the schedule of instructions in the default case, where there
is no scheduling information available in the machine model, add the
missing -mcpu argument to existing tests, so that future patches don't
unnecessarily change a huge number of tests. Note that the -mcpu
argument is already present in the tests of most targets, including ARM
and AArch64.
Care has been taken to pick the right -mcpu for each test after
studying its intent, and the patch should only affect the schedule of
instructions in each test.
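As a side note, a minimal Python sketch (not part of the patch) of how one
might audit a test directory for llc RUN lines that still lack an explicit
-mcpu; the directory path and the restriction to llc invocations are
assumptions for illustration only:

import re
from pathlib import Path

# Assumed checkout-relative test directory; adjust to the local layout.
TEST_DIR = Path("llvm/test/CodeGen/X86")

# Match RUN lines that invoke llc.
run_line = re.compile(r";\s*RUN:.*\bllc\b")

for test in sorted(TEST_DIR.glob("*.ll")):
    for lineno, line in enumerate(test.read_text(errors="ignore").splitlines(), start=1):
        # Flag llc RUN lines that do not already pin a CPU model.
        if run_line.search(line) and "-mcpu" not in line:
            print(f"{test}:{lineno}: {line.strip()}")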
---
.../X86/avx512bw-intrinsics-upgrade.ll | 118 +-
.../test/CodeGen/X86/bitcast-and-setcc-256.ll | 12 +-
.../X86/div-rem-pair-recomposition-signed.ll | 726 ++++----
.../div-rem-pair-recomposition-unsigned.ll | 583 ++++---
llvm/test/CodeGen/X86/fold-tied-op.ll | 119 +-
llvm/test/CodeGen/X86/horizontal-sum.ll | 221 +--
llvm/test/CodeGen/X86/is_fpclass.ll | 31 +-
llvm/test/CodeGen/X86/lea-opt-cse4.ll | 28 +-
llvm/test/CodeGen/X86/machine-cp.ll | 33 +-
llvm/test/CodeGen/X86/madd.ll | 101 +-
.../test/CodeGen/X86/masked_gather_scatter.ll | 520 +++---
.../CodeGen/X86/memcmp-more-load-pairs-x32.ll | 188 +-
.../CodeGen/X86/memcmp-more-load-pairs.ll | 514 +++---
llvm/test/CodeGen/X86/memcmp.ll | 238 +--
llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 320 ++--
llvm/test/CodeGen/X86/mul-constant-result.ll | 4 +-
llvm/test/CodeGen/X86/mul-i512.ll | 1133 ++++++------
llvm/test/CodeGen/X86/mul64.ll | 4 +-
llvm/test/CodeGen/X86/pr62653.ll | 2 +-
llvm/test/CodeGen/X86/rotate-multi.ll | 2 +-
llvm/test/CodeGen/X86/sad.ll | 30 +-
llvm/test/CodeGen/X86/sext-vsetcc.ll | 22 +-
llvm/test/CodeGen/X86/smul_fix.ll | 110 +-
llvm/test/CodeGen/X86/statepoint-live-in.ll | 3 +-
llvm/test/CodeGen/X86/statepoint-regs.ll | 3 +-
llvm/test/CodeGen/X86/ucmp.ll | 122 +-
llvm/test/CodeGen/X86/umul-with-overflow.ll | 71 +-
llvm/test/CodeGen/X86/umul_fix.ll | 153 +-
.../X86/umulo-128-legalisation-lowering.ll | 52 +-
llvm/test/CodeGen/X86/v8i1-masks.ll | 208 +--
.../vector-interleaved-store-i8-stride-5.ll | 1514 ++++++++---------
llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 14 +-
llvm/test/CodeGen/X86/win-smallparams.ll | 8 +-
.../CodeGen/X86/x86-interleaved-access.ll | 84 +-
llvm/test/CodeGen/X86/xmulo.ll | 337 ++--
35 files changed, 3789 insertions(+), 3839 deletions(-)
diff --git a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
index 51ffeca52a6652..fe2e917434a7af 100644
--- a/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
+++ b/llvm/test/CodeGen/X86/avx512bw-intrinsics-upgrade.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+avx512bw --show-mc-encoding | FileCheck %s --check-prefixes=CHECK,X64
declare i32 @llvm.x86.avx512.kunpck.wd(i32, i32)
@@ -1908,16 +1908,17 @@ define i64 @test_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x02]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
-; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x05]
+; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x64,0xc1]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -1994,18 +1995,18 @@ define i64 @test_mask_cmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %mask) nounwin
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpnltb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpgtb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x64,0xc1]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
-; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.cmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2083,16 +2084,17 @@ define i64 @test_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1) nounwind {
; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x02]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
+; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3f,0xc1,0x04]
+; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
-; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x05]
+; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
+; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
-; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0x7d,0x48,0x3e,0xc1,0x06]
-; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
-; X64-NEXT: leaq -1(%rcx,%rax), %rax # encoding: [0x48,0x8d,0x44,0x01,0xff]
+; X64-NEXT: decq %rax # encoding: [0x48,0xff,0xc8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 -1)
@@ -2169,18 +2171,18 @@ define i64 @test_mask_x86_avx512_ucmp_b_512(<64 x i8> %a0, <64 x i8> %a1, i64 %m
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x02]
-; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
-; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovq %k0, %rdx # encoding: [0xc4,0xe1,0xfb,0x93,0xd0]
-; X64-NEXT: addq %rax, %rdx # encoding: [0x48,0x01,0xc2]
; X64-NEXT: addq %rcx, %rdx # encoding: [0x48,0x01,0xca]
+; X64-NEXT: vpcmpneqb %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3f,0xc1,0x04]
+; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpnltub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x05]
; X64-NEXT: kmovq %k0, %rcx # encoding: [0xc4,0xe1,0xfb,0x93,0xc8]
+; X64-NEXT: addq %rax, %rcx # encoding: [0x48,0x01,0xc1]
; X64-NEXT: vpcmpnleub %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0x7d,0x49,0x3e,0xc1,0x06]
; X64-NEXT: kmovq %k0, %rax # encoding: [0xc4,0xe1,0xfb,0x93,0xc0]
; X64-NEXT: addq %rcx, %rax # encoding: [0x48,0x01,0xc8]
-; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: addq %rdx, %rax # encoding: [0x48,0x01,0xd0]
+; X64-NEXT: addq %rdi, %rax # encoding: [0x48,0x01,0xf8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i64 @llvm.x86.avx512.mask.ucmp.b.512(<64 x i8> %a0, <64 x i8> %a1, i32 0, i64 %mask)
@@ -2213,16 +2215,17 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
-; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
-; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT: decl %eax # encoding: [0x48]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2235,16 +2238,17 @@ define i32 @test_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x02]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
-; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x05]
+; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf1,0x7d,0x48,0x65,0xc1]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT: decl %eax # encoding: [0xff,0xc8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2277,18 +2281,18 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2302,18 +2306,18 @@ define i32 @test_mask_cmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) nounw
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmplew %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpnltw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x05]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpgtw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf1,0x7d,0x49,0x65,0xc1]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
-; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.cmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
@@ -2346,16 +2350,17 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; X86-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
-; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
-; X86-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X86-NEXT: leal -1(%ecx,%eax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X86-NEXT: decl %eax # encoding: [0x48]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
;
@@ -2368,16 +2373,17 @@ define i32 @test_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1) nounwind {
; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x02]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
+; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3f,0xc1,0x04]
+; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
-; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x05]
+; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
+; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 # encoding: [0x62,0xf3,0xfd,0x48,0x3e,0xc1,0x06]
-; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
-; X64-NEXT: leal -1(%rcx,%rax), %eax # encoding: [0x8d,0x44,0x01,0xff]
+; X64-NEXT: decl %eax # encoding: [0xff,0xc8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 -1)
@@ -2410,18 +2416,18 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X86-NEXT: kmovd %k0, %esi # encoding: [0xc5,0xfb,0x93,0xf0]
-; X86-NEXT: addl %eax, %esi # encoding: [0x01,0xc6]
; X86-NEXT: addl %edx, %esi # encoding: [0x01,0xd6]
+; X86-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
; X86-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
+; X86-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X86-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
; X86-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X86-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
-; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: addl %esi, %eax # encoding: [0x01,0xf0]
+; X86-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
; X86-NEXT: popl %esi # encoding: [0x5e]
; X86-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X86-NEXT: retl # encoding: [0xc3]
@@ -2435,18 +2441,18 @@ define i32 @test_mask_ucmp_w_512(<32 x i16> %a0, <32 x i16> %a1, i32 %mask) noun
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x02]
-; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
-; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
; X64-NEXT: kmovd %k0, %edx # encoding: [0xc5,0xfb,0x93,0xd0]
-; X64-NEXT: addl %eax, %edx # encoding: [0x01,0xc2]
; X64-NEXT: addl %ecx, %edx # encoding: [0x01,0xca]
+; X64-NEXT: vpcmpneqw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3f,0xc1,0x04]
+; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: vpcmpnltuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x05]
; X64-NEXT: kmovd %k0, %ecx # encoding: [0xc5,0xfb,0x93,0xc8]
+; X64-NEXT: addl %eax, %ecx # encoding: [0x01,0xc1]
; X64-NEXT: vpcmpnleuw %zmm1, %zmm0, %k0 {%k1} # encoding: [0x62,0xf3,0xfd,0x49,0x3e,0xc1,0x06]
; X64-NEXT: kmovd %k0, %eax # encoding: [0xc5,0xfb,0x93,0xc0]
; X64-NEXT: addl %ecx, %eax # encoding: [0x01,0xc8]
-; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: addl %edx, %eax # encoding: [0x01,0xd0]
+; X64-NEXT: addl %edi, %eax # encoding: [0x01,0xf8]
; X64-NEXT: vzeroupper # encoding: [0xc5,0xf8,0x77]
; X64-NEXT: retq # encoding: [0xc3]
%res0 = call i32 @llvm.x86.avx512.mask.ucmp.w.512(<32 x i16> %a0, <32 x i16> %a1, i32 0, i32 %mask)
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 234c7a0a500d30..88ccfeaf582802 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSE2 | FileCheck %s --check-prefix=SSE2-SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+SSSE3 | FileCheck %s --check-prefix=SSE2-SSSE3
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+SSE2 | FileCheck %s --check-prefix=SSE2-SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+SSSE3 | FileCheck %s --check-prefix=SSE2-SSSE3
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX12,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX12,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefix=AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefix=AVX512BW
define i4 @v4i64(<4 x i64> %a, <4 x i64> %b, <4 x i64> %c, <4 x i64> %d) {
; SSE2-SSSE3-LABEL: v4i64:
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
index f66d81c781fe0d..11923b9108ff40 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-signed.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64
; If the target does not have a single div/rem operation,
; -div-rem-pairs pass will decompose the remainder calculation as:
@@ -122,29 +122,32 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %ebp
; X86-NEXT: calll __divdi3
; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %ecx, 4(%edx)
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: imull %eax, %ebp
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: imull %ebx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, (%edi)
+; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: subl %eax, %esi
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: subl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -155,11 +158,23 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: je .LBB3_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: cqto
; X64-NEXT: idivq %rsi
+; X64-NEXT: jmp .LBB3_3
+; X64-NEXT: .LBB3_1:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: .LBB3_3:
; X64-NEXT: movq %rax, (%rcx)
-; X64-NEXT: imulq %rsi, %rax
-; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: subq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
%div = sdiv i64 %x, %y
@@ -200,27 +215,26 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 40(%ebp), %ecx
-; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: movl 40(%ebp), %ebx
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: sarl $31, %edx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: xorl %edx, %ebx
; X86-NEXT: movl 36(%ebp), %ecx
; X86-NEXT: xorl %edx, %ecx
-; X86-NEXT: movl 32(%ebp), %ebx
-; X86-NEXT: xorl %edx, %ebx
-; X86-NEXT: movl 28(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %edi
; X86-NEXT: xorl %edx, %edi
-; X86-NEXT: subl %edx, %edi
-; X86-NEXT: sbbl %edx, %ebx
+; X86-NEXT: movl 28(%ebp), %esi
+; X86-NEXT: xorl %edx, %esi
+; X86-NEXT: subl %edx, %esi
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %edx, %esi
+; X86-NEXT: sbbl %edx, %ebx
; X86-NEXT: xorl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: sete %cl
@@ -232,90 +246,91 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %ecx
+; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %ebx, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: bsrl %esi, %eax
+; X86-NEXT: xorl $31, %eax
+; X86-NEXT: orl $32, %eax
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: bsrl %edi, %edi
-; X86-NEXT: xorl $31, %edi
-; X86-NEXT: orl $32, %edi
+; X86-NEXT: testl %edi, %edi
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: orl $64, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %edx, %edi
-; X86-NEXT: orl $64, %edi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %edx
-; X86-NEXT: cmovnel %ecx, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: bsrl %eax, %edx
+; X86-NEXT: orl %ebx, %edx
+; X86-NEXT: cmovnel %ecx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: bsrl %ebx, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %eax, %eax
+; X86-NEXT: testl %ebx, %ebx
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: bsrl %ebx, %esi
-; X86-NEXT: xorl $31, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: bsrl %esi, %edi
+; X86-NEXT: xorl $31, %edi
; X86-NEXT: bsrl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: xorl $31, %edx
; X86-NEXT: orl $32, %edx
-; X86-NEXT: testl %ebx, %ebx
-; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: testl %esi, %esi
+; X86-NEXT: cmovnel %edi, %edx
; X86-NEXT: orl $64, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: orl %eax, %esi
+; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
-; X86-NEXT: xorl %ebx, %ebx
-; X86-NEXT: subl %edx, %edi
+; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, %edx
-; X86-NEXT: sbbl %edx, %edx
+; X86-NEXT: movl $0, %ebx
+; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
-; X86-NEXT: movl $0, %eax
-; X86-NEXT: sbbl %eax, %eax
; X86-NEXT: movl $127, %ecx
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %edi, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %eax, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: movl $0, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %esi, %ecx
-; X86-NEXT: movl $0, %ecx
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: cmovnel %edx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
+; X86-NEXT: cmovnel %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: cmovnel %ebx, %eax
-; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: cmovnel %edx, %eax
+; X86-NEXT: cmovel {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: jne .LBB4_1
; X86-NEXT: # %bb.8: # %_udiv-special-cases
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: xorl $127, %eax
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: xorl $127, %edx
+; X86-NEXT: orl %edi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %edx, %eax
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl %ebx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: je .LBB4_9
; X86-NEXT: # %bb.5: # %udiv-bb1
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -328,80 +343,79 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: negb %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl 152(%esp,%eax), %esi
-; X86-NEXT: movl 156(%esp,%eax), %edx
-; X86-NEXT: shldl %cl, %esi, %edx
+; X86-NEXT: movsbl %al, %edi
+; X86-NEXT: movl 152(%esp,%edi), %edx
+; X86-NEXT: movl 156(%esp,%edi), %ebx
+; X86-NEXT: shldl %cl, %edx, %ebx
+; X86-NEXT: movl 148(%esp,%edi), %eax
+; X86-NEXT: shldl %cl, %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 144(%esp,%eax), %edx
-; X86-NEXT: movl 148(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %esi
+; X86-NEXT: movl 144(%esp,%edi), %edx
; X86-NEXT: shldl %cl, %edx, %eax
; X86-NEXT: shll %cl, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl $1, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: adcl $0, %edx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.6:
-; X86-NEXT: xorl %ecx, %ecx
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: movl %esi, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jmp .LBB4_7
; X86-NEXT: .LBB4_1:
-; X86-NEXT: movl %ebx, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jmp .LBB4_9
; X86-NEXT: .LBB4_2: # %udiv-preheader
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: shrb $3, %al
-; X86-NEXT: andb $12, %al
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 108(%esp,%eax), %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: shrb $3, %cl
+; X86-NEXT: andb $12, %cl
+; X86-NEXT: movzbl %cl, %esi
+; X86-NEXT: movl 108(%esp,%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 104(%esp,%eax), %ebx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: shrdl %cl, %edx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 96(%esp,%eax), %esi
-; X86-NEXT: movl 100(%esp,%eax), %eax
+; X86-NEXT: movl 104(%esp,%esi), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %ecx, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shrdl %cl, %ebx, %edi
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: shrl %cl, %edx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %eax, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 100(%esp,%esi), %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 96(%esp,%esi), %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shrl %cl, %esi
+; X86-NEXT: shrdl %cl, %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -411,146 +425,146 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: adcl $-1, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %edx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: shldl $1, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %ebx, %edx
+; X86-NEXT: shldl $1, %eax, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shldl $1, %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: shldl $1, %ebx, %edx
-; X86-NEXT: shldl $1, %ecx, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edi
-; X86-NEXT: orl %esi, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: shldl $1, %edi, %ecx
-; X86-NEXT: orl %esi, %ecx
+; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: orl %edx, %esi
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: shldl $1, %esi, %eax
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %edi, %edi
-; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: cmpl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: orl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %esi, %esi
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: cmpl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: sbbl %ebx, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: sbbl %eax, %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: sarl $31, %ecx
-; X86-NEXT: movl %ecx, %esi
-; X86-NEXT: andl $1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl $1, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ecx, %esi
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, %edx
+; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: andl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: subl %ecx, %ebx
+; X86-NEXT: subl %ecx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %eax, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: sbbl %eax, %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: sbbl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: sbbl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: sbbl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %eax
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: adcl $-1, %ebx
; X86-NEXT: adcl $-1, %edi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %edi, %ecx
; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: .LBB4_7: # %udiv-loop-exit
-; X86-NEXT: shldl $1, %ebx, %esi
-; X86-NEXT: orl %edx, %esi
-; X86-NEXT: shldl $1, %eax, %ebx
-; X86-NEXT: orl %edx, %ebx
-; X86-NEXT: shldl $1, %edi, %eax
-; X86-NEXT: orl %edx, %eax
-; X86-NEXT: movl %edi, %edx
-; X86-NEXT: addl %edi, %edx
-; X86-NEXT: orl %ecx, %edx
+; X86-NEXT: shldl $1, %ecx, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: orl %edi, %ebx
+; X86-NEXT: shldl $1, %eax, %ecx
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %edi, %ecx
+; X86-NEXT: orl %edi, %eax
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: .LBB4_9: # %udiv-end
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: xorl %ecx, %esi
-; X86-NEXT: xorl %ecx, %ebx
-; X86-NEXT: xorl %ecx, %eax
-; X86-NEXT: xorl %ecx, %edx
-; X86-NEXT: subl %ecx, %edx
-; X86-NEXT: sbbl %ecx, %eax
-; X86-NEXT: sbbl %ecx, %ebx
-; X86-NEXT: sbbl %ecx, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 44(%ebp), %ecx
-; X86-NEXT: movl %edx, (%ecx)
-; X86-NEXT: movl %eax, 4(%ecx)
-; X86-NEXT: movl %ebx, 8(%ecx)
-; X86-NEXT: movl %esi, 12(%ecx)
+; X86-NEXT: xorl %esi, %ebx
+; X86-NEXT: xorl %esi, %ecx
+; X86-NEXT: xorl %esi, %eax
+; X86-NEXT: xorl %esi, %edx
+; X86-NEXT: subl %esi, %edx
+; X86-NEXT: sbbl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 28(%ebp), %ecx
-; X86-NEXT: movl %ebx, %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %ecx
+; X86-NEXT: sbbl %esi, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbl %esi, %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 28(%ebp), %edi
; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %esi, %edi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl 32(%ebp), %esi
; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edx
+; X86-NEXT: adcl %edi, %edx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %esi
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %ebx, 12(%eax)
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: mull %edi
@@ -558,32 +572,37 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: imull %esi, %edi
; X86-NEXT: addl %edx, %edi
; X86-NEXT: addl %ebx, %edi
-; X86-NEXT: movl 36(%ebp), %eax
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 40(%ebp), %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: imull %edx, %ebx
-; X86-NEXT: mull %edx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, (%ecx)
+; X86-NEXT: movl 40(%ebp), %esi
+; X86-NEXT: imull %eax, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl 36(%ebp), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ecx
+; X86-NEXT: addl %edx, %esi
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: adcl %edi, %ebx
-; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl 12(%ebp), %edx
; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl 16(%ebp), %ecx
; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %esi
-; X86-NEXT: sbbl %ebx, %esi
+; X86-NEXT: movl 20(%ebp), %ebx
+; X86-NEXT: sbbl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 44(%ebp), %edi
+; X86-NEXT: movl %eax, 4(%edi)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: sbbl %esi, %edi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: movl %ecx, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %esi, 12(%eax)
+; X86-NEXT: movl %ebx, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -680,17 +699,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
+; X86-NEXT: movd %ecx, %xmm5
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm6
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %esi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
@@ -701,18 +717,22 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movsbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: idivb {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movsbl (%esp), %eax
; X86-NEXT: idivb {{[0-9]+}}(%esp)
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-NEXT: movd %edx, %xmm7
+; X86-NEXT: movd %esi, %xmm7
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT: movd %esi, %xmm4
+; X86-NEXT: movd %edi, %xmm4
; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT: movd %edi, %xmm2
+; X86-NEXT: movd %ebx, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; X86-NEXT: movd %ebx, %xmm5
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movd %ecx, %xmm5
+; X86-NEXT: movzbl %dl, %ecx
; X86-NEXT: movd %ecx, %xmm6
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
@@ -756,74 +776,79 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %r10d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %ebp
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %r14d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %r15d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movl %eax, %edx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movzbl %dl, %r13d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movl %eax, %edx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movl %eax, %r8d
+; X64-NEXT: movzbl %cl, %edi
+; X64-NEXT: movzbl %sil, %esi
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movd %r11d, %xmm2
+; X64-NEXT: movzbl %dl, %r11d
+; X64-NEXT: movzbl %r8b, %edx
+; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movsbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: idivb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %edi, %xmm3
-; X64-NEXT: movd %esi, %xmm4
-; X64-NEXT: movd %r8d, %xmm5
-; X64-NEXT: movd %r9d, %xmm6
+; X64-NEXT: movd %r9d, %xmm3
+; X64-NEXT: movd %r10d, %xmm4
+; X64-NEXT: movd %ebx, %xmm5
+; X64-NEXT: movd %ebp, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r10d, %xmm7
+; X64-NEXT: movd %r14d, %xmm7
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r11d, %xmm4
+; X64-NEXT: movd %r15d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %ebx, %xmm2
+; X64-NEXT: movd %r12d, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm3
+; X64-NEXT: movd %r13d, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movd %r14d, %xmm4
+; X64-NEXT: movd %edi, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; X64-NEXT: movd %r15d, %xmm6
+; X64-NEXT: movd %esi, %xmm6
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X64-NEXT: movd %r12d, %xmm5
+; X64-NEXT: movd %r11d, %xmm5
; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; X64-NEXT: movd %r13d, %xmm3
+; X64-NEXT: movd %edx, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT: movd %edx, %xmm6
+; X64-NEXT: movd %r8d, %xmm6
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: movd %ecx, %xmm4
@@ -866,114 +891,116 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86: # %bb.0:
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: pextrw $7, %xmm0, %eax
-; X86-NEXT: pextrw $7, %xmm1, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: pextrw $7, %xmm1, %edi
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: pextrw $6, %xmm0, %ecx
+; X86-NEXT: idivw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: pextrw $6, %xmm1, %edi
; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: pextrw $6, %xmm0, %eax
-; X86-NEXT: pextrw $6, %xmm1, %esi
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: idivw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: pextrw $5, %xmm0, %ecx
; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X86-NEXT: pextrw $5, %xmm0, %eax
-; X86-NEXT: pextrw $5, %xmm1, %esi
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: pextrw $5, %xmm1, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: pextrw $4, %xmm0, %ecx
+; X86-NEXT: idivw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: pextrw $4, %xmm1, %edi
; X86-NEXT: movd %eax, %xmm4
-; X86-NEXT: pextrw $4, %xmm0, %eax
-; X86-NEXT: pextrw $4, %xmm1, %esi
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X86-NEXT: idivw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm2
-; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: pextrw $3, %xmm0, %eax
-; X86-NEXT: pextrw $3, %xmm1, %esi
+; X86-NEXT: pextrw $3, %xmm1, %ecx
+; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
+; X86-NEXT: idivw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: pextrw $2, %xmm0, %eax
-; X86-NEXT: pextrw $2, %xmm1, %esi
+; X86-NEXT: pextrw $2, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: idivw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: pextrw $1, %xmm0, %eax
-; X86-NEXT: pextrw $1, %xmm1, %esi
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
+; X86-NEXT: pextrw $1, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: idivw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: movd %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: cwtd
-; X86-NEXT: idivw %si
+; X86-NEXT: idivw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; X86-NEXT: movdqa %xmm5, (%ecx)
+; X86-NEXT: movdqa %xmm5, (%esi)
; X86-NEXT: pmullw %xmm1, %xmm5
; X86-NEXT: psubw %xmm5, %xmm0
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i16:
; X64: # %bb.0:
; X64-NEXT: pextrw $7, %xmm0, %eax
-; X64-NEXT: pextrw $7, %xmm1, %ecx
+; X64-NEXT: pextrw $7, %xmm1, %esi
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: cwtd
-; X64-NEXT: idivw %cx
+; X64-NEXT: pextrw $6, %xmm0, %ecx
+; X64-NEXT: idivw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: pextrw $6, %xmm1, %esi
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: pextrw $6, %xmm0, %eax
-; X64-NEXT: pextrw $6, %xmm1, %ecx
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cwtd
-; X64-NEXT: idivw %cx
+; X64-NEXT: idivw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: pextrw $5, %xmm0, %ecx
; X64-NEXT: movd %eax, %xmm3
-; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
-; X64-NEXT: pextrw $5, %xmm0, %eax
-; X64-NEXT: pextrw $5, %xmm1, %ecx
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: pextrw $5, %xmm1, %esi
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cwtd
-; X64-NEXT: idivw %cx
+; X64-NEXT: pextrw $4, %xmm0, %ecx
+; X64-NEXT: idivw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: pextrw $4, %xmm1, %esi
; X64-NEXT: movd %eax, %xmm4
-; X64-NEXT: pextrw $4, %xmm0, %eax
-; X64-NEXT: pextrw $4, %xmm1, %ecx
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: cwtd
-; X64-NEXT: idivw %cx
+; X64-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
+; X64-NEXT: idivw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm2
-; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
-; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: pextrw $3, %xmm0, %eax
; X64-NEXT: pextrw $3, %xmm1, %ecx
+; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: cwtd
+; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X64-NEXT: idivw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm3
@@ -984,8 +1011,8 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
; X64-NEXT: idivw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT: pextrw $1, %xmm0, %eax
+; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT: pextrw $1, %xmm1, %ecx
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: cwtd
@@ -1112,50 +1139,41 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $64, %esp
-; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-NEXT: movd %xmm1, (%esp)
+; X86-NEXT: movaps %xmm0, %xmm2
+; X86-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-NEXT: movups %xmm2, (%esp)
+; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __divdi3
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, (%esp)
+; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movups %xmm0, (%esp)
; X86-NEXT: movd %edx, %xmm0
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __divdi3
; X86-NEXT: movd %edx, %xmm1
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X86-NEXT: movdqa %xmm3, (%esi)
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; X86-NEXT: movdqa %xmm4, (%esi)
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-NEXT: movdqa %xmm3, %xmm1
; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: pmuludq %xmm3, %xmm1
-; X86-NEXT: movdqa %xmm3, %xmm2
+; X86-NEXT: pmuludq %xmm4, %xmm1
+; X86-NEXT: movdqa %xmm4, %xmm2
; X86-NEXT: psrlq $32, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm3, %xmm2
; X86-NEXT: paddq %xmm1, %xmm2
; X86-NEXT: psllq $32, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm3
-; X86-NEXT: paddq %xmm2, %xmm3
+; X86-NEXT: pmuludq %xmm3, %xmm4
+; X86-NEXT: paddq %xmm2, %xmm4
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: psubq %xmm3, %xmm0
+; X86-NEXT: psubq %xmm4, %xmm0
; X86-NEXT: addl $64, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
index 6fdde0b14a9843..c3ece6457c4801 100644
--- a/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
+++ b/llvm/test/CodeGen/X86/div-rem-pair-recomposition-unsigned.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=generic -mattr=+sse,+sse2 | FileCheck %s --check-prefix=X64
; If the target does not have a single div/rem operation,
; -div-rem-pairs pass will decompose the remainder calculation as:
@@ -122,29 +122,32 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: pushl %ebp
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: pushl %esi
+; X86-NEXT: pushl %eax
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: pushl {{[0-9]+}}(%esp)
+; X86-NEXT: pushl %ebp
; X86-NEXT: calll __udivdi3
; X86-NEXT: addl $16, %esp
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl %ecx, 4(%edx)
-; X86-NEXT: movl %eax, (%edx)
-; X86-NEXT: imull %eax, %ebp
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %ebp, %edx
-; X86-NEXT: imull %ebx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %ebx, (%edi)
+; X86-NEXT: imull %ebx, %esi
+; X86-NEXT: addl %esi, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: subl %eax, %esi
-; X86-NEXT: sbbl %ecx, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: subl %eax, %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: sbbl %ecx, %edx
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -155,11 +158,23 @@ define i64 @scalar_i64(i64 %x, i64 %y, ptr %divdst) nounwind {
; X64: # %bb.0:
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: orq %rsi, %rax
+; X64-NEXT: shrq $32, %rax
+; X64-NEXT: je .LBB3_1
+; X64-NEXT: # %bb.2:
+; X64-NEXT: movq %rdi, %rax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divq %rsi
+; X64-NEXT: jmp .LBB3_3
+; X64-NEXT: .LBB3_1:
+; X64-NEXT: movl %edi, %eax
+; X64-NEXT: xorl %edx, %edx
+; X64-NEXT: divl %esi
+; X64-NEXT: # kill: def $eax killed $eax def $rax
+; X64-NEXT: .LBB3_3:
; X64-NEXT: movq %rax, (%rcx)
-; X64-NEXT: imulq %rsi, %rax
-; X64-NEXT: subq %rax, %rdi
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: subq %rsi, %rdi
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: retq
%div = udiv i64 %x, %y
@@ -181,10 +196,10 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: andl $-16, %esp
; X86-NEXT: subl $160, %esp
; X86-NEXT: movl 28(%ebp), %ebx
-; X86-NEXT: movl 40(%ebp), %esi
-; X86-NEXT: movl 32(%ebp), %edi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: orl %esi, %eax
+; X86-NEXT: movl 40(%ebp), %edi
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %ebx, %ecx
; X86-NEXT: orl 36(%ebp), %ecx
; X86-NEXT: orl %eax, %ecx
@@ -197,23 +212,23 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: sete %al
; X86-NEXT: orb %cl, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: bsrl %esi, %edx
+; X86-NEXT: bsrl %edi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl 36(%ebp), %ecx
; X86-NEXT: xorl $31, %ecx
; X86-NEXT: orl $32, %ecx
-; X86-NEXT: testl %esi, %esi
+; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: bsrl %edi, %edx
+; X86-NEXT: bsrl %esi, %edx
; X86-NEXT: xorl $31, %edx
; X86-NEXT: bsrl %ebx, %eax
; X86-NEXT: xorl $31, %eax
; X86-NEXT: orl $32, %eax
-; X86-NEXT: testl %edi, %edi
+; X86-NEXT: testl %esi, %esi
; X86-NEXT: cmovnel %edx, %eax
; X86-NEXT: orl $64, %eax
; X86-NEXT: movl 36(%ebp), %edx
-; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: cmovnel %ecx, %eax
; X86-NEXT: movl 24(%ebp), %ebx
; X86-NEXT: bsrl %ebx, %edx
@@ -233,15 +248,14 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: testl %edi, %edi
; X86-NEXT: cmovnel %esi, %edx
; X86-NEXT: orl $64, %edx
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: movl 20(%ebp), %esi
; X86-NEXT: orl %ebx, %esi
; X86-NEXT: cmovnel %ecx, %edx
; X86-NEXT: subl %edx, %eax
; X86-NEXT: movl $0, %edx
; X86-NEXT: sbbl %edx, %edx
-; X86-NEXT: movl $0, %ebx
-; X86-NEXT: sbbl %ebx, %ebx
+; X86-NEXT: movl $0, %edi
+; X86-NEXT: sbbl %edi, %edi
; X86-NEXT: movl $0, %esi
; X86-NEXT: sbbl %esi, %esi
; X86-NEXT: movl $127, %ecx
@@ -249,49 +263,46 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %edx, %ecx
; X86-NEXT: movl $0, %ecx
-; X86-NEXT: sbbl %ebx, %ecx
+; X86-NEXT: sbbl %edi, %ecx
; X86-NEXT: movl $0, %ecx
; X86-NEXT: sbbl %esi, %ecx
; X86-NEXT: setb %cl
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %cl # 1-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: xorl $127, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edi, %eax
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: testb %cl, %cl
-; X86-NEXT: movb %cl, %ah
-; X86-NEXT: movl 24(%ebp), %ebx
-; X86-NEXT: movl $0, %esi
-; X86-NEXT: cmovnel %esi, %ebx
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: cmovnel %esi, %ecx
; X86-NEXT: movl $0, %edx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: cmovnel %edx, %esi
-; X86-NEXT: movl 12(%ebp), %edi
-; X86-NEXT: movl %edi, %ecx
-; X86-NEXT: cmovnel %edx, %ecx
-; X86-NEXT: orb %ah, %al
-; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: cmovnel %edx, %ebx
+; X86-NEXT: movl 20(%ebp), %edi
+; X86-NEXT: cmovnel %edx, %edi
+; X86-NEXT: movl 16(%ebp), %edx
+; X86-NEXT: movl $0, %esi
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 12(%ebp), %edx
+; X86-NEXT: cmovnel %esi, %edx
+; X86-NEXT: orb %cl, %al
+; X86-NEXT: movl 20(%ebp), %ecx
; X86-NEXT: jne .LBB4_7
; X86-NEXT: # %bb.1: # %udiv-bb1
-; X86-NEXT: movl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %eax
+; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
; X86-NEXT: xorps %xmm0, %xmm0
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl 16(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 20(%ebp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: movl 24(%ebp), %eax
; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: xorb $127, %cl
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
@@ -299,111 +310,110 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: negb %al
; X86-NEXT: movsbl %al, %eax
; X86-NEXT: movl 136(%esp,%eax), %edi
-; X86-NEXT: movl 140(%esp,%eax), %esi
-; X86-NEXT: shldl %cl, %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 128(%esp,%eax), %ebx
-; X86-NEXT: movl 132(%esp,%eax), %eax
-; X86-NEXT: shldl %cl, %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: shldl %cl, %ebx, %edi
-; X86-NEXT: shll %cl, %ebx
-; X86-NEXT: movl %ebx, %ecx
-; X86-NEXT: addl $1, %edx
+; X86-NEXT: movl 140(%esp,%eax), %edx
+; X86-NEXT: shldl %cl, %edi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl $0, %eax
+; X86-NEXT: movl 132(%esp,%eax), %ebx
+; X86-NEXT: shldl %cl, %ebx, %edi
+; X86-NEXT: movl 128(%esp,%eax), %edx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: shldl %cl, %edx, %eax
+; X86-NEXT: shll %cl, %edx
+; X86-NEXT: addl $1, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: adcl $0, %esi
-; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl 20(%ebp), %ebx
; X86-NEXT: jae .LBB4_2
; X86-NEXT: # %bb.5:
-; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: xorl %eax, %eax
-; X86-NEXT: movl %edi, %esi
+; X86-NEXT: xorl %esi, %esi
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: jmp .LBB4_6
; X86-NEXT: .LBB4_2: # %udiv-preheader
; X86-NEXT: movaps %xmm0, {{[0-9]+}}(%esp)
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 12(%ebp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl 16(%ebp), %edx
-; X86-NEXT: movl %edx, {{[0-9]+}}(%esp)
-; X86-NEXT: movl %ebx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 12(%ebp), %esi
+; X86-NEXT: movl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 20(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
+; X86-NEXT: movl 24(%ebp), %ecx
+; X86-NEXT: movl %ecx, {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 24(%ebp), %eax
-; X86-NEXT: movl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: # kill: def $al killed $al killed $eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: shrb $3, %al
; X86-NEXT: andb $12, %al
; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movl 92(%esp,%eax), %esi
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 92(%esp,%eax), %ebx
+; X86-NEXT: movl 88(%esp,%eax), %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 88(%esp,%eax), %edx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
-; X86-NEXT: shrdl %cl, %esi, %ebx
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 80(%esp,%eax), %edi
-; X86-NEXT: movl 84(%esp,%eax), %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shrdl %cl, %edx, %esi
+; X86-NEXT: shrdl %cl, %ebx, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shrl %cl, %edx
-; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 84(%esp,%eax), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: shrdl %cl, %eax, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl 80(%esp,%eax), %eax
+; X86-NEXT: shrl %cl, %ebx
+; X86-NEXT: movl %ebx, %edi
+; X86-NEXT: # kill: def $cl killed $cl killed $ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: shrdl %cl, %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: addl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 32(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl 36(%ebp), %esi
-; X86-NEXT: adcl $-1, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: adcl $-1, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 40(%ebp), %eax
; X86-NEXT: adcl $-1, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: xorl %eax, %eax
+; X86-NEXT: xorl %ecx, %ecx
; X86-NEXT: movl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: .p2align 4
; X86-NEXT: .LBB4_3: # %udiv-do-while
; X86-NEXT: # =>This Inner Loop Header: Depth=1
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: shldl $1, %esi, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %esi, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: shldl $1, %ebx, %edi
+; X86-NEXT: shldl $1, %eax, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ebx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edx
-; X86-NEXT: orl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: shldl $1, %edx, %ecx
-; X86-NEXT: orl %eax, %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: shldl $1, %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: shldl $1, %ecx, %edx
-; X86-NEXT: orl %eax, %edx
+; X86-NEXT: shldl $1, %ecx, %eax
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: addl %ecx, %ecx
; X86-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -428,115 +438,125 @@ define i128 @scalar_i128(i128 %x, i128 %y, ptr %divdst) nounwind {
; X86-NEXT: andl 28(%ebp), %ecx
; X86-NEXT: subl %ecx, %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: sbbl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl $-1, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl $-1, %eax
-; X86-NEXT: adcl $-1, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl $-1, %edx
; X86-NEXT: adcl $-1, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: orl %esi, %eax
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: orl %ebx, %ecx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: orl %edx, %ecx
; X86-NEXT: orl %eax, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: jne .LBB4_3
; X86-NEXT: # %bb.4:
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: .LBB4_6: # %udiv-loop-exit
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: shldl $1, %edi, %ebx
-; X86-NEXT: orl %eax, %ebx
-; X86-NEXT: shldl $1, %esi, %edi
-; X86-NEXT: orl %eax, %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: shldl $1, %ecx, %esi
-; X86-NEXT: orl %eax, %esi
-; X86-NEXT: addl %ecx, %ecx
-; X86-NEXT: orl %edx, %ecx
-; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: orl %ecx, %ebx
+; X86-NEXT: shldl $1, %eax, %edi
+; X86-NEXT: shldl $1, %edx, %eax
+; X86-NEXT: orl %ecx, %edi
+; X86-NEXT: orl %ecx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %edx, %edx
+; X86-NEXT: orl %esi, %edx
; X86-NEXT: .LBB4_7: # %udiv-end
-; X86-NEXT: movl %ecx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %ebx, 12(%eax)
-; X86-NEXT: movl %esi, %edx
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 36(%ebp), %eax
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: imull %edx, %esi
-; X86-NEXT: mull %ecx
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: mull %edx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %esi, %edx
-; X86-NEXT: movl 40(%ebp), %edi
-; X86-NEXT: imull %ecx, %edi
-; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl 40(%ebp), %eax
+; X86-NEXT: imull %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl 28(%ebp), %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl 44(%ebp), %ecx
+; X86-NEXT: movl %ebx, 12(%ecx)
; X86-NEXT: imull 28(%ebp), %ebx
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl 32(%ebp), %edx
-; X86-NEXT: imull %edx, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %esi, 8(%ecx)
+; X86-NEXT: movl 32(%ebp), %esi
+; X86-NEXT: imull %esi, %eax
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %edx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl 28(%ebp), %ecx
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: movl 16(%ebp), %esi
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %edi, %edx
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, %edi
; X86-NEXT: mull 32(%ebp)
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movl 44(%ebp), %ebx
+; X86-NEXT: movl %esi, (%ebx)
+; X86-NEXT: addl %ecx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl 12(%ebp), %ebx
; X86-NEXT: subl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-NEXT: movl 20(%ebp), %edi
-; X86-NEXT: sbbl %eax, %edi
-; X86-NEXT: movl 24(%ebp), %ecx
-; X86-NEXT: sbbl %edx, %ecx
+; X86-NEXT: movl 16(%ebp), %ecx
+; X86-NEXT: sbbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl 20(%ebp), %esi
+; X86-NEXT: sbbl %eax, %esi
+; X86-NEXT: movl 44(%ebp), %eax
+; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl 24(%ebp), %edi
+; X86-NEXT: sbbl %edx, %edi
; X86-NEXT: movl 8(%ebp), %eax
; X86-NEXT: movl %ebx, (%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, 8(%eax)
-; X86-NEXT: movl %ecx, 12(%eax)
+; X86-NEXT: movl %ecx, 4(%eax)
+; X86-NEXT: movl %esi, 8(%eax)
+; X86-NEXT: movl %edi, 12(%eax)
; X86-NEXT: leal -12(%ebp), %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -633,17 +653,14 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %eax
-; X86-NEXT: movd %eax, %xmm5
+; X86-NEXT: movzbl %al, %ecx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
+; X86-NEXT: movd %ecx, %xmm5
; X86-NEXT: movzbl %al, %eax
; X86-NEXT: movd %eax, %xmm6
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
-; X86-NEXT: movzbl %al, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movzbl %al, %esi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
@@ -654,18 +671,22 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: divb {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edx
; X86-NEXT: movzbl (%esp), %eax
; X86-NEXT: divb {{[0-9]+}}(%esp)
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
-; X86-NEXT: movd %edx, %xmm7
+; X86-NEXT: movd %esi, %xmm7
; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1]
-; X86-NEXT: movd %esi, %xmm4
+; X86-NEXT: movd %edi, %xmm4
; X86-NEXT: punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; X86-NEXT: movd %edi, %xmm2
+; X86-NEXT: movd %ebx, %xmm2
; X86-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm7[0],xmm4[1],xmm7[1],xmm4[2],xmm7[2],xmm4[3],xmm7[3],xmm4[4],xmm7[4],xmm4[5],xmm7[5],xmm4[6],xmm7[6],xmm4[7],xmm7[7]
; X86-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm6[0],xmm4[1],xmm6[1],xmm4[2],xmm6[2],xmm4[3],xmm6[3]
-; X86-NEXT: movd %ebx, %xmm5
-; X86-NEXT: movzbl %cl, %ecx
+; X86-NEXT: movd %ecx, %xmm5
+; X86-NEXT: movzbl %dl, %ecx
; X86-NEXT: movd %ecx, %xmm6
; X86-NEXT: movl 8(%ebp), %ecx
; X86-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm2[0],xmm5[1],xmm2[1],xmm5[2],xmm2[2],xmm5[3],xmm2[3],xmm5[4],xmm2[4],xmm5[5],xmm2[5],xmm5[6],xmm2[6],xmm5[7],xmm2[7]
@@ -709,74 +730,79 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
; X64-NEXT: movdqa %xmm1, -{{[0-9]+}}(%rsp)
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %eax
-; X64-NEXT: movd %eax, %xmm2
+; X64-NEXT: movzbl %al, %r11d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edi
+; X64-NEXT: movzbl %al, %r9d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %esi
+; X64-NEXT: movzbl %al, %r10d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r8d
+; X64-NEXT: movzbl %al, %ebx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r9d
+; X64-NEXT: movzbl %al, %ebp
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r10d
+; X64-NEXT: movzbl %al, %r14d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r11d
+; X64-NEXT: movzbl %al, %r15d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebx
+; X64-NEXT: movzbl %al, %r12d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %ebp
+; X64-NEXT: movl %eax, %edx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r14d
+; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r15d
+; X64-NEXT: movl %eax, %esi
+; X64-NEXT: movzbl %dl, %r13d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r12d
+; X64-NEXT: movl %eax, %edx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %r13d
+; X64-NEXT: movl %eax, %r8d
+; X64-NEXT: movzbl %cl, %edi
+; X64-NEXT: movzbl %sil, %esi
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movzbl %al, %edx
+; X64-NEXT: movd %r11d, %xmm2
+; X64-NEXT: movzbl %dl, %r11d
+; X64-NEXT: movzbl %r8b, %edx
+; X64-NEXT: movzbl %al, %r8d
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
; X64-NEXT: movl %eax, %ecx
; X64-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
; X64-NEXT: divb -{{[0-9]+}}(%rsp)
-; X64-NEXT: movd %edi, %xmm3
-; X64-NEXT: movd %esi, %xmm4
-; X64-NEXT: movd %r8d, %xmm5
-; X64-NEXT: movd %r9d, %xmm6
+; X64-NEXT: movd %r9d, %xmm3
+; X64-NEXT: movd %r10d, %xmm4
+; X64-NEXT: movd %ebx, %xmm5
+; X64-NEXT: movd %ebp, %xmm6
; X64-NEXT: punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT: movd %r10d, %xmm7
+; X64-NEXT: movd %r14d, %xmm7
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT: movd %r11d, %xmm4
+; X64-NEXT: movd %r15d, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1],xmm5[2],xmm3[2],xmm5[3],xmm3[3]
-; X64-NEXT: movd %ebx, %xmm2
+; X64-NEXT: movd %r12d, %xmm2
; X64-NEXT: punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; X64-NEXT: movd %ebp, %xmm3
+; X64-NEXT: movd %r13d, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3],xmm2[4],xmm4[4],xmm2[5],xmm4[5],xmm2[6],xmm4[6],xmm2[7],xmm4[7]
-; X64-NEXT: movd %r14d, %xmm4
+; X64-NEXT: movd %edi, %xmm4
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm7[0],xmm2[1],xmm7[1],xmm2[2],xmm7[2],xmm2[3],xmm7[3]
-; X64-NEXT: movd %r15d, %xmm6
+; X64-NEXT: movd %esi, %xmm6
; X64-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm5[0],xmm2[1],xmm5[1]
-; X64-NEXT: movd %r12d, %xmm5
+; X64-NEXT: movd %r11d, %xmm5
; X64-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; X64-NEXT: movd %r13d, %xmm3
+; X64-NEXT: movd %edx, %xmm3
; X64-NEXT: punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm6[0],xmm5[1],xmm6[1],xmm5[2],xmm6[2],xmm5[3],xmm6[3],xmm5[4],xmm6[4],xmm5[5],xmm6[5],xmm5[6],xmm6[6],xmm5[7],xmm6[7]
-; X64-NEXT: movd %edx, %xmm6
+; X64-NEXT: movd %r8d, %xmm6
; X64-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: movd %ecx, %xmm4
@@ -819,75 +845,77 @@ define <16 x i8> @vector_i128_i8(<16 x i8> %x, <16 x i8> %y, ptr %divdst) nounwi
define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounwind {
; X86-LABEL: vector_i128_i16:
; X86: # %bb.0:
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: pextrw $7, %xmm0, %eax
-; X86-NEXT: pextrw $7, %xmm1, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: pextrw $7, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: pextrw $6, %xmm0, %eax
-; X86-NEXT: pextrw $6, %xmm1, %esi
+; X86-NEXT: pextrw $6, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3]
; X86-NEXT: pextrw $5, %xmm0, %eax
-; X86-NEXT: pextrw $5, %xmm1, %esi
+; X86-NEXT: pextrw $5, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: pextrw $4, %xmm0, %ecx
; X86-NEXT: movd %eax, %xmm4
-; X86-NEXT: pextrw $4, %xmm0, %eax
-; X86-NEXT: pextrw $4, %xmm1, %esi
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: pextrw $4, %xmm1, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm2
; X86-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
; X86-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],xmm3[0],xmm2[1],xmm3[1]
; X86-NEXT: pextrw $3, %xmm0, %eax
-; X86-NEXT: pextrw $3, %xmm1, %esi
+; X86-NEXT: pextrw $3, %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
+; X86-NEXT: pextrw $2, %xmm0, %ecx
; X86-NEXT: movd %eax, %xmm4
-; X86-NEXT: pextrw $2, %xmm0, %eax
-; X86-NEXT: pextrw $2, %xmm1, %esi
-; X86-NEXT: # kill: def $ax killed $ax killed $eax
+; X86-NEXT: pextrw $2, %xmm1, %edi
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %di
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: pextrw $1, %xmm0, %eax
-; X86-NEXT: pextrw $1, %xmm1, %esi
+; X86-NEXT: pextrw $1, %xmm1, %ecx
+; X86-NEXT: punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm4[0],xmm3[1],xmm4[1],xmm3[2],xmm4[2],xmm3[3],xmm4[3]
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm4
; X86-NEXT: movd %xmm0, %eax
-; X86-NEXT: movd %xmm1, %esi
+; X86-NEXT: movd %xmm1, %ecx
; X86-NEXT: # kill: def $ax killed $ax killed $eax
; X86-NEXT: xorl %edx, %edx
-; X86-NEXT: divw %si
+; X86-NEXT: divw %cx
; X86-NEXT: # kill: def $ax killed $ax def $eax
; X86-NEXT: movd %eax, %xmm5
; X86-NEXT: punpcklwd {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3]
; X86-NEXT: punpckldq {{.*#+}} xmm5 = xmm5[0],xmm3[0],xmm5[1],xmm3[1]
; X86-NEXT: punpcklqdq {{.*#+}} xmm5 = xmm5[0],xmm2[0]
-; X86-NEXT: movdqa %xmm5, (%ecx)
+; X86-NEXT: movdqa %xmm5, (%esi)
; X86-NEXT: pmullw %xmm1, %xmm5
; X86-NEXT: psubw %xmm5, %xmm0
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
;
; X64-LABEL: vector_i128_i16:
@@ -913,12 +941,12 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: pextrw $4, %xmm0, %ecx
; X64-NEXT: movd %eax, %xmm4
-; X64-NEXT: pextrw $4, %xmm0, %eax
-; X64-NEXT: pextrw $4, %xmm1, %ecx
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: pextrw $4, %xmm1, %esi
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divw %cx
+; X64-NEXT: divw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm2
; X64-NEXT: punpcklwd {{.*#+}} xmm2 = xmm2[0],xmm4[0],xmm2[1],xmm4[1],xmm2[2],xmm4[2],xmm2[3],xmm4[3]
@@ -929,17 +957,17 @@ define <8 x i16> @vector_i128_i16(<8 x i16> %x, <8 x i16> %y, ptr %divdst) nounw
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
; X64-NEXT: # kill: def $ax killed $ax def $eax
+; X64-NEXT: pextrw $2, %xmm0, %ecx
; X64-NEXT: movd %eax, %xmm3
-; X64-NEXT: pextrw $2, %xmm0, %eax
-; X64-NEXT: pextrw $2, %xmm1, %ecx
-; X64-NEXT: # kill: def $ax killed $ax killed $eax
+; X64-NEXT: pextrw $2, %xmm1, %esi
+; X64-NEXT: movl %ecx, %eax
; X64-NEXT: xorl %edx, %edx
-; X64-NEXT: divw %cx
+; X64-NEXT: divw %si
; X64-NEXT: # kill: def $ax killed $ax def $eax
; X64-NEXT: movd %eax, %xmm4
-; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT: pextrw $1, %xmm0, %eax
; X64-NEXT: pextrw $1, %xmm1, %ecx
+; X64-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3]
; X64-NEXT: # kill: def $ax killed $ax killed $eax
; X64-NEXT: xorl %edx, %edx
; X64-NEXT: divw %cx
@@ -1065,50 +1093,41 @@ define <2 x i64> @vector_i128_i64(<2 x i64> %x, <2 x i64> %y, ptr %divdst) nounw
; X86: # %bb.0:
; X86-NEXT: pushl %esi
; X86-NEXT: subl $64, %esp
-; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
-; X86-NEXT: movdqu %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movups %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[3,3,3,3]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
-; X86-NEXT: movd %xmm2, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[3,3,3,3]
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: pshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
-; X86-NEXT: movd %xmm1, (%esp)
+; X86-NEXT: movaps %xmm0, %xmm2
+; X86-NEXT: unpckhpd {{.*#+}} xmm2 = xmm2[1],xmm1[1]
+; X86-NEXT: movups %xmm2, (%esp)
+; X86-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; X86-NEXT: movups %xmm0, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, {{[0-9]+}}(%esp)
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm1 # 16-byte Reload
-; X86-NEXT: pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
-; X86-NEXT: movd %xmm0, {{[0-9]+}}(%esp)
-; X86-NEXT: movd %xmm1, (%esp)
+; X86-NEXT: movups {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
+; X86-NEXT: movups %xmm0, (%esp)
; X86-NEXT: movd %edx, %xmm0
; X86-NEXT: movd %eax, %xmm1
; X86-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1]
; X86-NEXT: movdqu %xmm1, {{[-0-9]+}}(%e{{[sb]}}p) # 16-byte Spill
; X86-NEXT: calll __udivdi3
; X86-NEXT: movd %edx, %xmm1
-; X86-NEXT: movd %eax, %xmm3
-; X86-NEXT: punpckldq {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1]
-; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: punpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm0[0]
-; X86-NEXT: movdqa %xmm3, (%esi)
+; X86-NEXT: movd %eax, %xmm4
+; X86-NEXT: punpckldq {{.*#+}} xmm4 = xmm4[0],xmm1[0],xmm4[1],xmm1[1]
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: movdqa %xmm0, %xmm1
+; X86-NEXT: punpcklqdq {{.*#+}} xmm4 = xmm4[0],xmm0[0]
+; X86-NEXT: movdqa %xmm4, (%esi)
+; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm3 # 16-byte Reload
+; X86-NEXT: movdqa %xmm3, %xmm1
; X86-NEXT: psrlq $32, %xmm1
-; X86-NEXT: pmuludq %xmm3, %xmm1
-; X86-NEXT: movdqa %xmm3, %xmm2
+; X86-NEXT: pmuludq %xmm4, %xmm1
+; X86-NEXT: movdqa %xmm4, %xmm2
; X86-NEXT: psrlq $32, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm2
+; X86-NEXT: pmuludq %xmm3, %xmm2
; X86-NEXT: paddq %xmm1, %xmm2
; X86-NEXT: psllq $32, %xmm2
-; X86-NEXT: pmuludq %xmm0, %xmm3
-; X86-NEXT: paddq %xmm2, %xmm3
+; X86-NEXT: pmuludq %xmm3, %xmm4
+; X86-NEXT: paddq %xmm2, %xmm4
; X86-NEXT: movdqu {{[-0-9]+}}(%e{{[sb]}}p), %xmm0 # 16-byte Reload
-; X86-NEXT: psubq %xmm3, %xmm0
+; X86-NEXT: psubq %xmm4, %xmm0
; X86-NEXT: addl $64, %esp
; X86-NEXT: popl %esi
; X86-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/fold-tied-op.ll b/llvm/test/CodeGen/X86/fold-tied-op.ll
index 5ea2964057588f..d2e8465464e35a 100644
--- a/llvm/test/CodeGen/X86/fold-tied-op.ll
+++ b/llvm/test/CodeGen/X86/fold-tied-op.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd < %s | FileCheck %s
+; RUN: llc -verify-machineinstrs -mtriple=i386--netbsd -mcpu=generic < %s | FileCheck %s
; Regression test for http://reviews.llvm.org/D5701
@@ -25,81 +25,86 @@ define i64 @fn1() #0 {
; CHECK-NEXT: .cfi_offset %edi, -16
; CHECK-NEXT: .cfi_offset %ebx, -12
; CHECK-NEXT: movl $-1028477379, %ecx # imm = 0xC2B2AE3D
-; CHECK-NEXT: movl $668265295, %esi # imm = 0x27D4EB4F
-; CHECK-NEXT: movl a, %edi
-; CHECK-NEXT: cmpl $0, (%edi)
+; CHECK-NEXT: movl $668265295, %edi # imm = 0x27D4EB4F
+; CHECK-NEXT: movl a, %ebx
+; CHECK-NEXT: cmpl $0, (%ebx)
; CHECK-NEXT: je .LBB0_2
; CHECK-NEXT: # %bb.1: # %if.then
-; CHECK-NEXT: movl 8(%edi), %ecx
-; CHECK-NEXT: movl 12(%edi), %edx
-; CHECK-NEXT: movl %edx, %eax
+; CHECK-NEXT: movl 8(%ebx), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 12(%ebx), %esi
+; CHECK-NEXT: movl %esi, %eax
; CHECK-NEXT: shldl $1, %ecx, %eax
-; CHECK-NEXT: orl %edx, %eax
-; CHECK-NEXT: leal (%ecx,%ecx), %edx
-; CHECK-NEXT: orl %ecx, %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 16(%edi), %ebx
-; CHECK-NEXT: movl 20(%edi), %edx
-; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: shldl $2, %ebx, %edx
-; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: shldl $31, %ebx, %ecx
-; CHECK-NEXT: shll $2, %ebx
-; CHECK-NEXT: orl %ecx, %ebx
+; CHECK-NEXT: orl %esi, %eax
+; CHECK-NEXT: movl 16(%ebx), %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 20(%ebx), %edx
+; CHECK-NEXT: movl %edx, %esi
+; CHECK-NEXT: shldl $2, %ecx, %esi
+; CHECK-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; CHECK-NEXT: leal (%esi,%esi), %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl %edx, %esi
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; CHECK-NEXT: shrl %ecx
-; CHECK-NEXT: orl %edx, %ecx
-; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; CHECK-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: adcl %eax, %ecx
+; CHECK-NEXT: shldl $31, %ecx, %esi
+; CHECK-NEXT: shll $2, %ecx
+; CHECK-NEXT: orl %esi, %ecx
+; CHECK-NEXT: shrl %edx
+; CHECK-NEXT: orl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; CHECK-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; CHECK-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl 24(%edi), %eax
+; CHECK-NEXT: adcl %eax, %edx
+; CHECK-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; CHECK-NEXT: movl 24(%ebx), %ecx
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: mull %edi
; CHECK-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; CHECK-NEXT: movl $-1028477379, %ebx # imm = 0xC2B2AE3D
-; CHECK-NEXT: imull %eax, %ebx
-; CHECK-NEXT: mull %esi
-; CHECK-NEXT: movl %eax, %ecx
-; CHECK-NEXT: addl %ebx, %edx
-; CHECK-NEXT: movl 28(%edi), %edi
-; CHECK-NEXT: imull %edi, %esi
-; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: movl $-1028477379, %eax # imm = 0xC2B2AE3D
+; CHECK-NEXT: imull %ecx, %eax
+; CHECK-NEXT: addl %eax, %edx
+; CHECK-NEXT: movl 28(%ebx), %ebx
+; CHECK-NEXT: imull %ebx, %edi
+; CHECK-NEXT: addl %edx, %edi
; CHECK-NEXT: movl $1336530590, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: imull $-2056954758, %ecx, %esi # imm = 0x85655C7A
+; CHECK-NEXT: addl %edx, %esi
+; CHECK-NEXT: imull $1336530590, %ebx, %edx # imm = 0x4FA9D69E
+; CHECK-NEXT: addl %esi, %edx
; CHECK-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; CHECK-NEXT: shrdl $3, %edi, %ebx
+; CHECK-NEXT: sarl $3, %edi
+; CHECK-NEXT: orl %edx, %edi
+; CHECK-NEXT: orl %eax, %ebx
+; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87
; CHECK-NEXT: movl %ebx, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $-2056954758, %ebx, %ebx # imm = 0x85655C7A
-; CHECK-NEXT: addl %edx, %ebx
-; CHECK-NEXT: imull $1336530590, %edi, %edx # imm = 0x4FA9D69E
-; CHECK-NEXT: addl %ebx, %edx
-; CHECK-NEXT: shrdl $3, %esi, %ecx
-; CHECK-NEXT: sarl $3, %esi
-; CHECK-NEXT: orl %edx, %esi
-; CHECK-NEXT: orl %eax, %ecx
-; CHECK-NEXT: movl $-66860409, %ebx # imm = 0xFC03CA87
-; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: movl %eax, %edi
-; CHECK-NEXT: imull $326129324, %ecx, %eax # imm = 0x137056AC
-; CHECK-NEXT: addl %edx, %eax
-; CHECK-NEXT: imull $-66860409, %esi, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: movl %eax, %esi
+; CHECK-NEXT: imull $-66860409, %edi, %ecx # imm = 0xFC03CA87
+; CHECK-NEXT: addl %edx, %ecx
+; CHECK-NEXT: imull $326129324, %ebx, %eax # imm = 0x137056AC
; CHECK-NEXT: addl %eax, %ecx
; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; CHECK-NEXT: movl %edi, b
-; CHECK-NEXT: movl %edi, %eax
-; CHECK-NEXT: mull %ebx
-; CHECK-NEXT: imull $326129324, %edi, %esi # imm = 0x137056AC
+; CHECK-NEXT: xorl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl $-66860409, %edx # imm = 0xFC03CA87
+; CHECK-NEXT: mull %edx
+; CHECK-NEXT: movl %esi, b
+; CHECK-NEXT: imull $326129324, %esi, %esi # imm = 0x137056AC
; CHECK-NEXT: addl %edx, %esi
; CHECK-NEXT: movl %ecx, b+4
; CHECK-NEXT: imull $-66860409, %ecx, %ecx # imm = 0xFC03CA87
; CHECK-NEXT: jmp .LBB0_3
; CHECK-NEXT: .LBB0_2: # %if.else
-; CHECK-NEXT: xorl b+4, %ecx
-; CHECK-NEXT: xorl b, %esi
+; CHECK-NEXT: xorl b, %edi
; CHECK-NEXT: movl $1419758215, %edx # imm = 0x549FCA87
-; CHECK-NEXT: movl %esi, %eax
+; CHECK-NEXT: movl %edi, %eax
; CHECK-NEXT: mull %edx
-; CHECK-NEXT: imull $93298681, %esi, %esi # imm = 0x58F9FF9
+; CHECK-NEXT: xorl b+4, %ecx
+; CHECK-NEXT: imull $93298681, %edi, %esi # imm = 0x58F9FF9
; CHECK-NEXT: addl %edx, %esi
; CHECK-NEXT: imull $1419758215, %ecx, %ecx # imm = 0x549FCA87
; CHECK-NEXT: .LBB0_3: # %if.end
diff --git a/llvm/test/CodeGen/X86/horizontal-sum.ll b/llvm/test/CodeGen/X86/horizontal-sum.ll
index 5fe1e2996ee9b0..b124d4641c8f77 100644
--- a/llvm/test/CodeGen/X86/horizontal-sum.ll
+++ b/llvm/test/CodeGen/X86/horizontal-sum.ll
@@ -1,10 +1,16 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3 | FileCheck %s --check-prefixes=SSSE3-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+ssse3,fast-hops | FileCheck %s --check-prefixes=SSSE3-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
-; RUN: llc < %s -mtriple=x86_64-unknown -mattr=+avx2,fast-hops | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+ssse3 \
+; RUN: | FileCheck %s --check-prefixes=SSSE3-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+ssse3,fast-hops \
+; RUN: | FileCheck %s --check-prefixes=SSSE3-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+avx \
+; RUN: | FileCheck %s --check-prefixes=AVX-SLOW,AVX1-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+avx,fast-hops \
+; RUN: | FileCheck %s --check-prefixes=AVX-FAST,AVX1-FAST
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+avx2 \
+; RUN: | FileCheck %s --check-prefixes=AVX-SLOW,AVX2-SLOW
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic -mattr=+avx2,fast-hops \
+; RUN: | FileCheck %s --check-prefixes=AVX-FAST,AVX2-FAST
; Vectorized Pairwise Sum Reductions
; e.g.
@@ -102,11 +108,11 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[0,0,0,0]
@@ -125,11 +131,11 @@ define <4 x i32> @pair_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
+; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm1
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[1,1,1,1]
-; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm1, %xmm1
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm2[1,1,1,1]
+; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm1, %xmm2
@@ -169,16 +175,16 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: haddps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: movaps %xmm0, %xmm1
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm3, %xmm2
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm1 = xmm1[1,3],xmm0[1,3]
; SSSE3-SLOW-NEXT: haddps %xmm4, %xmm5
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: haddps %xmm5, %xmm2
-; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
-; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-SLOW-NEXT: addps %xmm1, %xmm0
; SSSE3-SLOW-NEXT: haddps %xmm7, %xmm6
+; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: haddps %xmm6, %xmm6
+; SSSE3-SLOW-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-SLOW-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,1]
; SSSE3-SLOW-NEXT: movaps %xmm2, %xmm1
; SSSE3-SLOW-NEXT: retq
@@ -188,12 +194,12 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm0, %xmm0
; SSSE3-FAST-NEXT: haddps %xmm3, %xmm2
-; SSSE3-FAST-NEXT: haddps %xmm5, %xmm4
-; SSSE3-FAST-NEXT: haddps %xmm4, %xmm2
-; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
-; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
-; SSSE3-FAST-NEXT: haddps %xmm6, %xmm4
; SSSE3-FAST-NEXT: movaps %xmm4, %xmm1
+; SSSE3-FAST-NEXT: haddps %xmm5, %xmm1
+; SSSE3-FAST-NEXT: haddps %xmm1, %xmm2
+; SSSE3-FAST-NEXT: haddps %xmm7, %xmm6
+; SSSE3-FAST-NEXT: haddps %xmm6, %xmm1
+; SSSE3-FAST-NEXT: movlhps {{.*#+}} xmm0 = xmm0[0],xmm2[0]
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8f32_v4f32:
@@ -201,20 +207,20 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm4
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm4
+; AVX1-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm1
; AVX1-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
-; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm1[0,1]
-; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[0]
-; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm3 = xmm2[0,2],xmm4[0,1]
+; AVX1-SLOW-NEXT: vinsertps {{.*#+}} xmm3 = xmm3[0,1,2],xmm1[0]
+; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[1,3]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm2[0,1,2],xmm1[3]
; AVX1-SLOW-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX1-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
@@ -232,10 +238,10 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vaddps %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX1-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX1-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
@@ -243,22 +249,22 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-SLOW-LABEL: pair_sum_v8f32_v4f32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vhaddps %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vaddps %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm8
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm8 = xmm0[1,3,1,3]
; AVX2-SLOW-NEXT: vhaddps %xmm3, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,2,1,3]
+; AVX2-SLOW-NEXT: vhaddps %xmm5, %xmm5, %xmm3
+; AVX2-SLOW-NEXT: vaddps %xmm0, %xmm8, %xmm0
+; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm4
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
-; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-SLOW-NEXT: vhaddps %xmm4, %xmm5, %xmm3
-; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
+; AVX2-SLOW-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm3[0]
+; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm4[3,1]
; AVX2-SLOW-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-SLOW-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX2-SLOW-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-SLOW-NEXT: vhaddps %xmm2, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-SLOW-NEXT: retq
@@ -270,16 +276,16 @@ define <8 x float> @pair_sum_v8f32_v4f32(<4 x float> %0, <4 x float> %1, <4 x fl
; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm4, %xmm1
; AVX2-FAST-NEXT: vhaddps %xmm5, %xmm5, %xmm8
; AVX2-FAST-NEXT: vhaddps %xmm3, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm2[0,2],xmm1[0,1]
; AVX2-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm8[0]
-; AVX2-FAST-NEXT: vhaddps %xmm4, %xmm5, %xmm3
; AVX2-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[1,3],xmm3[3,1]
; AVX2-FAST-NEXT: vaddps %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vmovlhps {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
-; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vhaddps %xmm7, %xmm6, %xmm2
+; AVX2-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm1[1,0]
; AVX2-FAST-NEXT: vhaddps %xmm0, %xmm2, %xmm2
+; AVX2-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX2-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX2-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX2-FAST-NEXT: retq
@@ -335,16 +341,16 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-SLOW-LABEL: pair_sum_v8i32_v4i32:
; SSSE3-SLOW: # %bb.0:
; SSSE3-SLOW-NEXT: phaddd %xmm1, %xmm0
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
; SSSE3-SLOW-NEXT: phaddd %xmm4, %xmm5
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; SSSE3-SLOW-NEXT: phaddd %xmm5, %xmm2
-; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
-; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-SLOW-NEXT: paddd %xmm1, %xmm0
; SSSE3-SLOW-NEXT: phaddd %xmm7, %xmm6
+; SSSE3-SLOW-NEXT: pshufd {{.*#+}} xmm1 = xmm2[0,1,3,2]
; SSSE3-SLOW-NEXT: phaddd %xmm6, %xmm6
+; SSSE3-SLOW-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; SSSE3-SLOW-NEXT: palignr {{.*#+}} xmm6 = xmm1[8,9,10,11,12,13,14,15],xmm6[0,1,2,3,4,5,6,7]
; SSSE3-SLOW-NEXT: movdqa %xmm6, %xmm1
; SSSE3-SLOW-NEXT: retq
@@ -353,38 +359,38 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; SSSE3-FAST: # %bb.0:
; SSSE3-FAST-NEXT: phaddd %xmm1, %xmm0
; SSSE3-FAST-NEXT: phaddd %xmm0, %xmm0
-; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm2
+; SSSE3-FAST-NEXT: movdqa %xmm2, %xmm1
+; SSSE3-FAST-NEXT: phaddd %xmm3, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm5, %xmm4
-; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm2
-; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm2[0]
+; SSSE3-FAST-NEXT: phaddd %xmm4, %xmm1
; SSSE3-FAST-NEXT: phaddd %xmm6, %xmm6
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm7
; SSSE3-FAST-NEXT: phaddd %xmm7, %xmm6
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm2 = xmm2[2,3],xmm6[0,2]
-; SSSE3-FAST-NEXT: movaps %xmm2, %xmm1
+; SSSE3-FAST-NEXT: punpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,3],xmm6[0,2]
; SSSE3-FAST-NEXT: retq
;
; AVX1-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX1-SLOW: # %bb.0:
; AVX1-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
-; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,3,1,3]
+; AVX1-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm5
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX1-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm5 = xmm4[0,0,0,0]
-; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm5[6,7]
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm5[0,0,0,0]
+; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm3 = xmm3[0,1,2,3,4,5],xmm4[6,7]
; AVX1-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,3]
-; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX1-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3]
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
+; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
+; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-SLOW-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-SLOW-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-SLOW-NEXT: retq
@@ -404,10 +410,10 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX1-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
-; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
+; AVX1-FAST-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm1
; AVX1-FAST-NEXT: vinsertf128 $1, %xmm2, %ymm0, %ymm0
; AVX1-FAST-NEXT: vshufpd {{.*#+}} ymm0 = ymm1[0],ymm0[1],ymm1[2],ymm0[2]
; AVX1-FAST-NEXT: retq
@@ -415,25 +421,25 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-SLOW-LABEL: pair_sum_v8i32_v4i32:
; AVX2-SLOW: # %bb.0:
; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm0, %xmm0
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[1,3,1,3]
-; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vphaddd %xmm4, %xmm4, %xmm1
-; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm4
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm0[1,3,1,3]
+; AVX2-SLOW-NEXT: vphaddd %xmm5, %xmm5, %xmm5
+; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
; AVX2-SLOW-NEXT: vphaddd %xmm3, %xmm2, %xmm2
+; AVX2-SLOW-NEXT: vpaddd %xmm4, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm3 = xmm2[0,2,2,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm3[0],xmm1[0]
-; AVX2-SLOW-NEXT: vpbroadcastd %xmm4, %xmm5
-; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm5[3]
+; AVX2-SLOW-NEXT: vpbroadcastd %xmm5, %xmm4
+; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm3 = xmm3[0,1,2],xmm4[3]
; AVX2-SLOW-NEXT: vshufps {{.*#+}} xmm1 = xmm2[1,3],xmm1[1,1]
-; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
+; AVX2-SLOW-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm5[3]
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-SLOW-NEXT: vphaddd %xmm2, %xmm2, %xmm2
; AVX2-SLOW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-SLOW-NEXT: vphaddd %xmm7, %xmm6, %xmm1
-; AVX2-SLOW-NEXT: vphaddd %xmm1, %xmm1, %xmm1
-; AVX2-SLOW-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-SLOW-NEXT: vpbroadcastq %xmm2, %ymm1
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-SLOW-NEXT: retq
;
@@ -452,11 +458,11 @@ define <8 x i32> @pair_sum_v8i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i32> %2,
; AVX2-FAST-NEXT: vblendps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[3]
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
+; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm2
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[2,3,2,3]
+; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm2, %xmm2
; AVX2-FAST-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm0
-; AVX2-FAST-NEXT: vphaddd %xmm7, %xmm6, %xmm1
-; AVX2-FAST-NEXT: vphaddd %xmm0, %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpbroadcastq %xmm1, %ymm1
+; AVX2-FAST-NEXT: vpbroadcastq %xmm2, %ymm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} ymm0 = ymm0[0,1,2,3,4,5],ymm1[6,7]
; AVX2-FAST-NEXT: retq
%9 = shufflevector <4 x i32> %0, <4 x i32> poison, <2 x i32> <i32 0, i32 2>
@@ -552,9 +558,10 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; SSSE3-FAST-NEXT: unpckhps {{.*#+}} xmm0 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[3,3],xmm4[3,3]
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[2,3]
+; SSSE3-FAST-NEXT: movaps %xmm2, %xmm4
+; SSSE3-FAST-NEXT: haddps %xmm2, %xmm4
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm1 = xmm1[2,0],xmm2[3,3]
-; SSSE3-FAST-NEXT: haddps %xmm2, %xmm2
-; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm2[0,1]
+; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm5 = xmm5[0,2],xmm4[0,1]
; SSSE3-FAST-NEXT: addps %xmm5, %xmm0
; SSSE3-FAST-NEXT: addps %xmm1, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm3, %xmm1
@@ -596,9 +603,9 @@ define <4 x float> @sequential_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[3],xmm1[1],zero,zero
; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm4[0,2],xmm1[0,1]
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[0,1],xmm2[3,3]
; AVX-FAST-NEXT: vblendps {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0,1,2],xmm4[0]
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0,1,2],xmm3[2]
; AVX-FAST-NEXT: vaddps %xmm1, %xmm2, %xmm1
@@ -687,7 +694,7 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-SLOW-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[1,1,1,1]
; AVX1-SLOW-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[0,0,0,0]
@@ -704,19 +711,19 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX1-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX1-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX1-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm2 = xmm5[0,1,2,3],xmm2[4,5,6,7]
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX1-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
-; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[0,0,0,0]
-; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX1-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm4[0,0,0,0]
+; AVX1-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX1-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX1-FAST-NEXT: vpblendw {{.*#+}} xmm0 = xmm0[0,1,2,3,4,5],xmm1[6,7]
; AVX1-FAST-NEXT: retq
;
@@ -733,7 +740,7 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-SLOW-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-SLOW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-SLOW-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX2-SLOW-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; AVX2-SLOW-NEXT: vpbroadcastq %xmm3, %xmm1
; AVX2-SLOW-NEXT: vpbroadcastd %xmm3, %xmm2
@@ -750,19 +757,19 @@ define <4 x i32> @sequential_sum_v4i32_v4i32(<4 x i32> %0, <4 x i32> %1, <4 x i3
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[0,2,2,3]
; AVX2-FAST-NEXT: vpunpckhdq {{.*#+}} xmm5 = xmm0[2],xmm1[2],xmm0[3],xmm1[3]
; AVX2-FAST-NEXT: vpunpckhqdq {{.*#+}} xmm0 = xmm0[1],xmm1[1]
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vphaddd %xmm2, %xmm2, %xmm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[1,3,2,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm4[0],xmm1[0]
; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm4 = xmm2[3,3,3,3]
; AVX2-FAST-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm4[0]
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm2 = xmm5[0,1],xmm2[2,3]
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm4
+; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX2-FAST-NEXT: vphaddd %xmm3, %xmm3, %xmm1
-; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm2 = xmm3[2,2,2,2]
-; AVX2-FAST-NEXT: vpbroadcastd %xmm1, %xmm1
-; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm3, %xmm2
-; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm2, %xmm1
+; AVX2-FAST-NEXT: vpshufd {{.*#+}} xmm1 = xmm3[2,2,2,2]
+; AVX2-FAST-NEXT: vpbroadcastd %xmm4, %xmm2
+; AVX2-FAST-NEXT: vpaddd %xmm1, %xmm3, %xmm1
+; AVX2-FAST-NEXT: vpaddd %xmm2, %xmm1, %xmm1
; AVX2-FAST-NEXT: vpblendd {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[3]
; AVX2-FAST-NEXT: retq
%5 = shufflevector <4 x i32> %0, <4 x i32> %1, <2 x i32> <i32 0, i32 4>
@@ -840,9 +847,9 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm0[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
; SSSE3-FAST-NEXT: shufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
-; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm4
; SSSE3-FAST-NEXT: haddps %xmm1, %xmm4
+; SSSE3-FAST-NEXT: addss %xmm5, %xmm0
; SSSE3-FAST-NEXT: movaps %xmm1, %xmm5
; SSSE3-FAST-NEXT: unpckhpd {{.*#+}} xmm5 = xmm5[1],xmm1[1]
; SSSE3-FAST-NEXT: addss %xmm4, %xmm5
@@ -903,23 +910,23 @@ define <4 x float> @reduction_sum_v4f32_v4f32(<4 x float> %0, <4 x float> %1, <4
; AVX-FAST-NEXT: vhaddps %xmm0, %xmm0, %xmm4
; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm0[1,0]
; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm5
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm0 = xmm0[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm0, %xmm4, %xmm0
-; AVX-FAST-NEXT: vhaddps %xmm1, %xmm1, %xmm4
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm5 = xmm1[1,0]
-; AVX-FAST-NEXT: vaddss %xmm5, %xmm4, %xmm4
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm1[1,0]
+; AVX-FAST-NEXT: vaddss %xmm4, %xmm5, %xmm4
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm1 = xmm1[3,3,3,3]
+; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm5
; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[2,3]
-; AVX-FAST-NEXT: vhaddps %xmm2, %xmm2, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm4 = xmm2[1,0]
-; AVX-FAST-NEXT: vaddss %xmm4, %xmm1, %xmm1
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm2[1,0]
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm5, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm2[3,3,3,3]
+; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm4
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1],xmm1[0],xmm0[3]
-; AVX-FAST-NEXT: vhaddps %xmm3, %xmm3, %xmm1
-; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm2 = xmm3[1,0]
-; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
+; AVX-FAST-NEXT: vshufpd {{.*#+}} xmm1 = xmm3[1,0]
+; AVX-FAST-NEXT: vaddss %xmm1, %xmm4, %xmm1
; AVX-FAST-NEXT: vshufps {{.*#+}} xmm2 = xmm3[3,3,3,3]
; AVX-FAST-NEXT: vaddss %xmm2, %xmm1, %xmm1
; AVX-FAST-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],xmm1[0]
diff --git a/llvm/test/CodeGen/X86/is_fpclass.ll b/llvm/test/CodeGen/X86/is_fpclass.ll
index 97136dafa6c2c0..e2ebc27c7e331d 100644
--- a/llvm/test/CodeGen/X86/is_fpclass.ll
+++ b/llvm/test/CodeGen/X86/is_fpclass.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-linux | FileCheck %s -check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefixes=X64,X64-GENERIC
-; RUN: llc < %s -mtriple=x86_64-linux -mattr=+ndd | FileCheck %s -check-prefixes=X64,X64-NDD
+; RUN: llc < %s -mtriple=i686-linux -mcpu=generic | FileCheck %s -check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=generic | FileCheck %s -check-prefixes=X64,X64-GENERIC
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=generic -mattr=+ndd | FileCheck %s -check-prefixes=X64,X64-NDD
define i1 @isnan_f(float %x) {
; X86-LABEL: isnan_f:
@@ -1591,35 +1591,30 @@ entry:
define <4 x i1> @isnan_v4f_strictfp(<4 x float> %x) strictfp {
; X86-LABEL: isnan_v4f_strictfp:
; X86: # %bb.0: # %entry
-; X86-NEXT: pushl %esi
-; X86-NEXT: .cfi_def_cfa_offset 8
-; X86-NEXT: .cfi_offset %esi, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl $2147483647, %ecx # imm = 0x7FFFFFFF
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
+; X86-NEXT: setge %al
+; X86-NEXT: shlb $2, %al
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: andl %ecx, %edx
; X86-NEXT: cmpl $2139095041, %edx # imm = 0x7F800001
-; X86-NEXT: setge %dh
-; X86-NEXT: shlb $2, %dh
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: andl %ecx, %esi
-; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001
; X86-NEXT: setge %dl
; X86-NEXT: shlb $3, %dl
-; X86-NEXT: orb %dh, %dl
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: andl %ecx, %esi
-; X86-NEXT: cmpl $2139095041, %esi # imm = 0x7F800001
+; X86-NEXT: orb %al, %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: andl %ecx, %eax
+; X86-NEXT: cmpl $2139095041, %eax # imm = 0x7F800001
; X86-NEXT: setge %dh
; X86-NEXT: andl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpl $2139095041, %ecx # imm = 0x7F800001
; X86-NEXT: setge %cl
; X86-NEXT: addb %cl, %cl
; X86-NEXT: orb %dh, %cl
; X86-NEXT: orb %dl, %cl
; X86-NEXT: movb %cl, (%eax)
-; X86-NEXT: popl %esi
-; X86-NEXT: .cfi_def_cfa_offset 4
; X86-NEXT: retl $4
;
; X64-LABEL: isnan_v4f_strictfp:
diff --git a/llvm/test/CodeGen/X86/lea-opt-cse4.ll b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
index 4fa9acd99bb2fa..2079eb50608f70 100644
--- a/llvm/test/CodeGen/X86/lea-opt-cse4.ll
+++ b/llvm/test/CodeGen/X86/lea-opt-cse4.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s -check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=generic | FileCheck %s -check-prefix=X86
%struct.SA = type { i32 , i32 , i32 , i32 , i32};
@@ -13,9 +13,11 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: addl %eax, %ecx
; X64-NEXT: leal (%rcx,%rax), %edx
-; X64-NEXT: leal 1(%rax,%rcx), %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: incl %ecx
; X64-NEXT: movl %ecx, 12(%rdi)
-; X64-NEXT: leal 1(%rax,%rdx), %eax
+; X64-NEXT: addl %edx, %eax
+; X64-NEXT: incl %eax
; X64-NEXT: movl %eax, 16(%rdi)
; X64-NEXT: retq
;
@@ -30,10 +32,12 @@ define void @foo(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: leal 1(%ecx,%edx), %esi
+; X86-NEXT: leal (%ecx,%edx), %esi
+; X86-NEXT: incl %esi
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl %esi, 12(%eax)
-; X86-NEXT: leal 1(%ecx,%edx), %ecx
+; X86-NEXT: addl %edx, %ecx
+; X86-NEXT: incl %ecx
; X86-NEXT: movl %ecx, 16(%eax)
; X86-NEXT: popl %esi
; X86-NEXT: .cfi_def_cfa_offset 4
@@ -64,13 +68,15 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X64-NEXT: # =>This Inner Loop Header: Depth=1
; X64-NEXT: movl (%rdi), %ecx
; X64-NEXT: movl 16(%rdi), %eax
-; X64-NEXT: leal 1(%rcx,%rax), %edx
+; X64-NEXT: leal (%rcx,%rax), %edx
+; X64-NEXT: incl %edx
; X64-NEXT: movl %edx, 12(%rdi)
; X64-NEXT: decl %esi
; X64-NEXT: jne .LBB1_1
; X64-NEXT: # %bb.2: # %exit
; X64-NEXT: addl %eax, %ecx
-; X64-NEXT: leal 1(%rax,%rcx), %ecx
+; X64-NEXT: addl %eax, %ecx
+; X64-NEXT: incl %ecx
; X64-NEXT: leal (%rax,%rax), %edx
; X64-NEXT: addl %eax, %edx
; X64-NEXT: addl %edx, %ecx
@@ -93,13 +99,15 @@ define void @foo_loop(ptr nocapture %ctx, i32 %n) local_unnamed_addr #0 {
; X86-NEXT: # =>This Inner Loop Header: Depth=1
; X86-NEXT: movl (%eax), %esi
; X86-NEXT: movl 16(%eax), %ecx
-; X86-NEXT: leal 1(%esi,%ecx), %edi
+; X86-NEXT: leal (%esi,%ecx), %edi
+; X86-NEXT: incl %edi
; X86-NEXT: movl %edi, 12(%eax)
; X86-NEXT: decl %edx
; X86-NEXT: jne .LBB1_1
; X86-NEXT: # %bb.2: # %exit
; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: leal 1(%ecx,%esi), %edx
+; X86-NEXT: leal (%ecx,%esi), %edx
+; X86-NEXT: incl %edx
; X86-NEXT: leal (%ecx,%ecx), %esi
; X86-NEXT: addl %ecx, %esi
; X86-NEXT: addl %esi, %edx
diff --git a/llvm/test/CodeGen/X86/machine-cp.ll b/llvm/test/CodeGen/X86/machine-cp.ll
index c84a1159ad56a6..609532b0197564 100644
--- a/llvm/test/CodeGen/X86/machine-cp.ll
+++ b/llvm/test/CodeGen/X86/machine-cp.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -mattr=+sse2 -verify-machineinstrs | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -mcpu=x86-64 -mattr=+sse2 -verify-machineinstrs | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
@@ -17,8 +17,8 @@ define i32 @t1(i32 %a, i32 %b) nounwind {
; CHECK-NEXT: movl %edx, %ecx
; CHECK-NEXT: cltd
; CHECK-NEXT: idivl %ecx
-; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: movl %ecx, %eax
+; CHECK-NEXT: testl %edx, %edx
; CHECK-NEXT: jne LBB0_2
; CHECK-NEXT: ## %bb.3: ## %while.end
; CHECK-NEXT: movl %ecx, %eax
@@ -59,21 +59,34 @@ define i32 @t3(i64 %a, i64 %b) nounwind {
; CHECK: ## %bb.0: ## %entry
; CHECK-NEXT: movq %rdi, %rax
; CHECK-NEXT: testq %rsi, %rsi
-; CHECK-NEXT: je LBB2_4
-; CHECK-NEXT: ## %bb.1: ## %while.body.preheader
+; CHECK-NEXT: je LBB2_7
+; CHECK-NEXT: ## %bb.1:
; CHECK-NEXT: movq %rsi, %rdx
+; CHECK-NEXT: jmp LBB2_4
; CHECK-NEXT: .p2align 4
-; CHECK-NEXT: LBB2_2: ## %while.body
-; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
-; CHECK-NEXT: movq %rdx, %rcx
+; CHECK-NEXT: LBB2_2: ## in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT: cqto
; CHECK-NEXT: idivq %rcx
-; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: LBB2_3: ## in Loop: Header=BB2_4 Depth=1
; CHECK-NEXT: movq %rcx, %rax
+; CHECK-NEXT: testq %rdx, %rdx
+; CHECK-NEXT: je LBB2_6
+; CHECK-NEXT: LBB2_4: ## %while.body
+; CHECK-NEXT: ## =>This Inner Loop Header: Depth=1
+; CHECK-NEXT: movq %rdx, %rcx
+; CHECK-NEXT: movq %rax, %rdx
+; CHECK-NEXT: orq %rcx, %rdx
+; CHECK-NEXT: shrq $32, %rdx
; CHECK-NEXT: jne LBB2_2
-; CHECK-NEXT: ## %bb.3: ## %while.end
+; CHECK-NEXT: ## %bb.5: ## in Loop: Header=BB2_4 Depth=1
+; CHECK-NEXT: ## kill: def $eax killed $eax killed $rax
+; CHECK-NEXT: xorl %edx, %edx
+; CHECK-NEXT: divl %ecx
+; CHECK-NEXT: ## kill: def $edx killed $edx def $rdx
+; CHECK-NEXT: jmp LBB2_3
+; CHECK-NEXT: LBB2_6: ## %while.end
; CHECK-NEXT: movl %ecx, %eax
-; CHECK-NEXT: LBB2_4:
+; CHECK-NEXT: LBB2_7:
; CHECK-NEXT: retq
entry:
%cmp1 = icmp eq i64 %b, 0
diff --git a/llvm/test/CodeGen/X86/madd.ll b/llvm/test/CodeGen/X86/madd.ll
index bdb7c307a57590..6f1d7b0f67b031 100644
--- a/llvm/test/CodeGen/X86/madd.ll
+++ b/llvm/test/CodeGen/X86/madd.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefix=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX256,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX256,AVX512,AVX512BW
define i32 @_Z10test_shortPsS_i_128(ptr nocapture readonly, ptr nocapture readonly, i32) local_unnamed_addr #0 {
; SSE2-LABEL: _Z10test_shortPsS_i_128:
@@ -454,9 +454,9 @@ define i32 @_Z10test_shortPsS_i_1024(ptr nocapture readonly, ptr nocapture reado
; AVX2-NEXT: vmovdqu (%rsi,%rcx,2), %ymm3
; AVX2-NEXT: vmovdqu 32(%rsi,%rcx,2), %ymm4
; AVX2-NEXT: vpmaddwd (%rdi,%rcx,2), %ymm3, %ymm3
+; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm4
; AVX2-NEXT: vpaddd %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vpmaddwd 32(%rdi,%rcx,2), %ymm4, %ymm3
-; AVX2-NEXT: vpaddd %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpaddd %ymm2, %ymm4, %ymm2
; AVX2-NEXT: addq $16, %rcx
; AVX2-NEXT: cmpq %rcx, %rax
; AVX2-NEXT: jne .LBB3_1
@@ -817,9 +817,9 @@ define i32 @_Z9test_charPcS_i_512(ptr nocapture readonly, ptr nocapture readonly
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rcx), %xmm2
; AVX1-NEXT: vpmovsxbw (%rdi,%rcx), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm4
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm5
; AVX1-NEXT: vpmaddwd %xmm2, %xmm4, %xmm2
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm4
-; AVX1-NEXT: vpmaddwd %xmm3, %xmm4, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm3, %xmm5, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
@@ -1008,9 +1008,9 @@ define i32 @_Z9test_charPcS_i_1024(ptr nocapture readonly, ptr nocapture readonl
; AVX1-NEXT: vpmovsxbw 16(%rsi,%rcx), %xmm7
; AVX1-NEXT: vpmaddwd %xmm4, %xmm7, %xmm4
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rcx), %xmm7
+; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm8
; AVX1-NEXT: vpmaddwd %xmm5, %xmm7, %xmm5
-; AVX1-NEXT: vpmovsxbw (%rsi,%rcx), %xmm7
-; AVX1-NEXT: vpmaddwd %xmm6, %xmm7, %xmm6
+; AVX1-NEXT: vpmaddwd %xmm6, %xmm8, %xmm6
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm7
; AVX1-NEXT: vpaddd %xmm7, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm1, %xmm4, %xmm1
@@ -1297,9 +1297,9 @@ define i32 @test_unsigned_short_256(ptr nocapture readonly, ptr nocapture readon
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm1 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm2 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm4 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm1, %xmm3, %xmm1
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm3 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm2, %xmm3, %xmm2
+; AVX1-NEXT: vpmulld %xmm2, %xmm4, %xmm2
; AVX1-NEXT: vextractf128 $1, %ymm0, %xmm3
; AVX1-NEXT: vpaddd %xmm3, %xmm1, %xmm1
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
@@ -1439,9 +1439,9 @@ define i32 @test_unsigned_short_512(ptr nocapture readonly, ptr nocapture readon
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm3, %xmm6, %xmm3
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm7 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm4, %xmm6, %xmm4
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm6 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm5, %xmm6, %xmm5
+; AVX1-NEXT: vpmulld %xmm5, %xmm7, %xmm5
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm6
; AVX1-NEXT: vpaddd %xmm6, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
@@ -1658,17 +1658,17 @@ define i32 @test_unsigned_short_1024(ptr nocapture readonly, ptr nocapture reado
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm5, %xmm12, %xmm5
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm6, %xmm12, %xmm6
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm7, %xmm12, %xmm7
+; AVX1-NEXT: vpmulld %xmm7, %xmm13, %xmm7
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm8, %xmm12, %xmm8
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm9, %xmm12, %xmm9
; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
+; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm13 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
; AVX1-NEXT: vpmulld %xmm10, %xmm12, %xmm10
-; AVX1-NEXT: vpmovzxwd {{.*#+}} xmm12 = mem[0],zero,mem[1],zero,mem[2],zero,mem[3],zero
-; AVX1-NEXT: vpmulld %xmm11, %xmm12, %xmm11
+; AVX1-NEXT: vpmulld %xmm11, %xmm13, %xmm11
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm12
; AVX1-NEXT: vpaddd %xmm4, %xmm12, %xmm4
; AVX1-NEXT: vpaddd %xmm1, %xmm5, %xmm1
@@ -2194,7 +2194,6 @@ define <16 x i32> @jumbled_indices16(<32 x i16> %A, <32 x i16> %B) {
define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
; SSE2-LABEL: jumbled_indices32:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm0
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm1
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm2
@@ -2202,6 +2201,7 @@ define <32 x i32> @jumbled_indices32(<64 x i16> %A, <64 x i16> %B) {
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm4
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm5
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: pmaddwd {{[0-9]+}}(%rsp), %xmm7
; SSE2-NEXT: movdqa %xmm7, 112(%rdi)
; SSE2-NEXT: movdqa %xmm6, 96(%rdi)
@@ -2363,10 +2363,10 @@ define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) {
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1
+; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
; AVX1-NEXT: retq
;
; AVX2-LABEL: pmaddwd_512:
@@ -2410,7 +2410,6 @@ define <16 x i32> @pmaddwd_512(ptr %Aptr, ptr %Bptr) {
define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
; SSE2-LABEL: pmaddwd_1024:
; SSE2: # %bb.0:
-; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movdqa (%rsi), %xmm0
; SSE2-NEXT: movdqa 16(%rsi), %xmm1
; SSE2-NEXT: movdqa 32(%rsi), %xmm2
@@ -2423,6 +2422,7 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
; SSE2-NEXT: pmaddwd 64(%rdx), %xmm4
; SSE2-NEXT: movdqa 80(%rsi), %xmm5
; SSE2-NEXT: pmaddwd 80(%rdx), %xmm5
+; SSE2-NEXT: movq %rdi, %rax
; SSE2-NEXT: movdqa 96(%rsi), %xmm6
; SSE2-NEXT: pmaddwd 96(%rdx), %xmm6
; SSE2-NEXT: movdqa 112(%rsi), %xmm7
@@ -2445,15 +2445,15 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
; AVX1-NEXT: vmovdqa 48(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 16(%rsi), %xmm1, %xmm1
; AVX1-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
-; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm1
+; AVX1-NEXT: vpmaddwd 48(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vpmaddwd 32(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm2, %ymm1
-; AVX1-NEXT: vmovdqa 80(%rdi), %xmm2
-; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm2, %xmm2
-; AVX1-NEXT: vmovdqa 64(%rdi), %xmm3
-; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm3, %xmm3
-; AVX1-NEXT: vinsertf128 $1, %xmm2, %ymm3, %ymm2
+; AVX1-NEXT: vinsertf128 $1, %xmm1, %ymm0, %ymm0
+; AVX1-NEXT: vmovdqa 80(%rdi), %xmm1
+; AVX1-NEXT: vpmaddwd 80(%rsi), %xmm1, %xmm4
+; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm2, %ymm1
+; AVX1-NEXT: vmovdqa 64(%rdi), %xmm2
+; AVX1-NEXT: vpmaddwd 64(%rsi), %xmm2, %xmm2
+; AVX1-NEXT: vinsertf128 $1, %xmm4, %ymm2, %ymm2
; AVX1-NEXT: vmovdqa 112(%rdi), %xmm3
; AVX1-NEXT: vpmaddwd 112(%rsi), %xmm3, %xmm3
; AVX1-NEXT: vmovdqa 96(%rdi), %xmm4
@@ -2481,10 +2481,10 @@ define <32 x i32> @pmaddwd_1024(ptr %Aptr, ptr %Bptr) {
; AVX512F-NEXT: vmovdqa 96(%rdi), %ymm3
; AVX512F-NEXT: vpmaddwd 32(%rsi), %ymm1, %ymm1
; AVX512F-NEXT: vpmaddwd (%rsi), %ymm0, %ymm0
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
-; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm1
+; AVX512F-NEXT: vpmaddwd 96(%rsi), %ymm3, %ymm3
; AVX512F-NEXT: vpmaddwd 64(%rsi), %ymm2, %ymm2
-; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0
+; AVX512F-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm1
; AVX512F-NEXT: retq
;
; AVX512BW-LABEL: pmaddwd_1024:
@@ -2679,18 +2679,18 @@ define i32 @madd_quad_reduction(ptr %arg, ptr %arg1, ptr %arg2, ptr %arg3, ptr %
;
; AVX-LABEL: madd_quad_reduction:
; AVX: # %bb.0:
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX-NEXT: movq {{[0-9]+}}(%rsp), %r10
; AVX-NEXT: vmovdqu (%rdi), %xmm0
; AVX-NEXT: vpmaddwd (%rsi), %xmm0, %xmm0
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX-NEXT: vmovdqu (%rdx), %xmm1
; AVX-NEXT: vpmaddwd (%rcx), %xmm1, %xmm1
+; AVX-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: vmovdqu (%r8), %xmm2
+; AVX-NEXT: vpmaddwd (%r9), %xmm2, %xmm2
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; AVX-NEXT: vmovdqu (%r8), %xmm1
-; AVX-NEXT: vpmaddwd (%r9), %xmm1, %xmm1
-; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
-; AVX-NEXT: vmovdqu (%r10), %xmm1
+; AVX-NEXT: vmovdqu (%rcx), %xmm1
; AVX-NEXT: vpmaddwd (%rax), %xmm1, %xmm1
+; AVX-NEXT: vpaddd %xmm2, %xmm0, %xmm0
; AVX-NEXT: vpaddd %xmm0, %xmm1, %xmm0
; AVX-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2739,6 +2739,7 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; SSE2: # %bb.0: # %entry
; SSE2-NEXT: movl %esi, %eax
; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: xorl %ecx, %ecx
; SSE2-NEXT: pxor %xmm1, %xmm1
; SSE2-NEXT: pxor %xmm2, %xmm2
; SSE2-NEXT: pxor %xmm3, %xmm3
@@ -2755,8 +2756,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; SSE2-NEXT: paddd %xmm5, %xmm3
; SSE2-NEXT: pmaddwd %xmm4, %xmm4
; SSE2-NEXT: paddd %xmm4, %xmm1
-; SSE2-NEXT: addq $8, %rdi
-; SSE2-NEXT: addq $-8, %rax
+; SSE2-NEXT: addq $8, %rcx
+; SSE2-NEXT: cmpq %rcx, %rax
; SSE2-NEXT: jne .LBB33_1
; SSE2-NEXT: # %bb.2: # %middle.block
; SSE2-NEXT: paddd %xmm3, %xmm2
@@ -2779,6 +2780,7 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; AVX1: # %bb.0: # %entry
; AVX1-NEXT: movl %esi, %eax
; AVX1-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX1-NEXT: xorl %ecx, %ecx
; AVX1-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX1-NEXT: .p2align 4
; AVX1-NEXT: .LBB33_1: # %vector.body
@@ -2795,8 +2797,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; AVX1-NEXT: vpaddd %xmm4, %xmm3, %xmm3
; AVX1-NEXT: vpaddd %xmm0, %xmm2, %xmm0
; AVX1-NEXT: vinsertf128 $1, %xmm3, %ymm0, %ymm0
-; AVX1-NEXT: addq $8, %rdi
-; AVX1-NEXT: addq $-8, %rax
+; AVX1-NEXT: addq $8, %rcx
+; AVX1-NEXT: cmpq %rcx, %rax
; AVX1-NEXT: jne .LBB33_1
; AVX1-NEXT: # %bb.2: # %middle.block
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm2
@@ -2822,6 +2824,7 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; AVX256: # %bb.0: # %entry
; AVX256-NEXT: movl %esi, %eax
; AVX256-NEXT: vpxor %xmm0, %xmm0, %xmm0
+; AVX256-NEXT: xorl %ecx, %ecx
; AVX256-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX256-NEXT: .p2align 4
; AVX256-NEXT: .LBB33_1: # %vector.body
@@ -2830,8 +2833,8 @@ define i64 @sum_and_sum_of_squares(ptr %a, i32 %n) {
; AVX256-NEXT: vpaddd %ymm1, %ymm2, %ymm1
; AVX256-NEXT: vpmaddwd %ymm2, %ymm2, %ymm2
; AVX256-NEXT: vpaddd %ymm0, %ymm2, %ymm0
-; AVX256-NEXT: addq $8, %rdi
-; AVX256-NEXT: addq $-8, %rax
+; AVX256-NEXT: addq $8, %rcx
+; AVX256-NEXT: cmpq %rcx, %rax
; AVX256-NEXT: jne .LBB33_1
; AVX256-NEXT: # %bb.2: # %middle.block
; AVX256-NEXT: vextracti128 $1, %ymm1, %xmm2
@@ -3149,9 +3152,9 @@ define i32 @add_used_by_loop_phi(ptr %a, ptr %b, i64 %offset_a, i64 %offset_b, i
; AVX1-NEXT: vpmovsxbw 8(%rdi,%rax), %xmm2
; AVX1-NEXT: vpmovsxbw (%rdi,%rax), %xmm3
; AVX1-NEXT: vpmovsxbw 8(%rsi,%rax), %xmm4
+; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm5
; AVX1-NEXT: vpmaddwd %xmm4, %xmm2, %xmm2
-; AVX1-NEXT: vpmovsxbw (%rsi,%rax), %xmm4
-; AVX1-NEXT: vpmaddwd %xmm4, %xmm3, %xmm3
+; AVX1-NEXT: vpmaddwd %xmm5, %xmm3, %xmm3
; AVX1-NEXT: vextractf128 $1, %ymm1, %xmm4
; AVX1-NEXT: vpaddd %xmm4, %xmm2, %xmm2
; AVX1-NEXT: vpaddd %xmm1, %xmm3, %xmm1
diff --git a/llvm/test/CodeGen/X86/masked_gather_scatter.ll b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
index 1289eef7795dcc..c31c34752d3a15 100644
--- a/llvm/test/CodeGen/X86/masked_gather_scatter.ll
+++ b/llvm/test/CodeGen/X86/masked_gather_scatter.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
-; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
-; RUN: llc -mtriple=i386-unknown-linux-gnu -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=knl -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_64
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=knl -mattr=+avx512f < %s | FileCheck %s --check-prefix=ALL --check-prefix=KNL_32
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_SMALL
+; RUN: llc -mtriple=x86_64-unknown-linux-gnu -mcpu=skx -mattr=+avx512vl -mattr=+avx512dq -code-model=large < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX --check-prefix=SKX_LARGE
+; RUN: llc -mtriple=i386-unknown-linux-gnu -mcpu=skx -mattr=+avx512vl -mattr=+avx512dq < %s | FileCheck %s --check-prefix=ALL --check-prefix=SKX_32
; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: opt -mtriple=x86_64-apple-darwin -passes=scalarize-masked-mem-intrin -mcpu=corei7-avx -S < %s | FileCheck %s -check-prefix=SCALAR
; RUN: llc -O0 -mtriple=x86_64-unknown-linux-gnu -mcpu=skx < %s -o /dev/null
@@ -98,7 +98,7 @@ define <16 x float> @test2(ptr %base, <16 x i32> %ind, i16 %mask) {
;
; SKX-LABEL: test2:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
@@ -143,7 +143,7 @@ define <16 x i32> @test3(ptr %base, <16 x i32> %ind, i16 %mask) {
;
; SKX-LABEL: test3:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm0
@@ -195,9 +195,9 @@ define <16 x i32> @test4(ptr %base, <16 x i32> %ind, i16 %mask) {
;
; SKX-LABEL: test4:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: kmovq %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm1 {%k2}
; SKX-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX-NEXT: vpgatherdd (%rdi,%zmm0,4), %zmm2 {%k1}
@@ -209,7 +209,7 @@ define <16 x i32> @test4(ptr %base, <16 x i32> %ind, i16 %mask) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: kmovq %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (%eax,%zmm0,4), %zmm2 {%k1}
@@ -249,7 +249,6 @@ define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_64-NEXT: kmovw %k1, %k2
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test5:
@@ -259,13 +258,12 @@ define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; KNL_32-NEXT: kmovw %k1, %k2
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test5:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
-; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: kmovd %esi, %k1
+; SKX-NEXT: kmovq %k1, %k2
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k2}
; SKX-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX-NEXT: vzeroupper
@@ -275,7 +273,7 @@ define void @test5(ptr %base, <16 x i32> %ind, i16 %mask, <16 x i32>%val) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
-; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: kmovq %k1, %k2
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k2}
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
; SKX_32-NEXT: vzeroupper
@@ -340,10 +338,10 @@ define <8 x i32> @test6(<8 x i32>%a1, <8 x ptr> %ptr) {
;
; SKX_32-LABEL: test6:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX_32-NEXT: kxnorw %k0, %k0, %k2
-; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k2}
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
+; SKX_32-NEXT: vpgatherdd (,%ymm1), %ymm2 {%k1}
+; SKX_32-NEXT: kxnorw %k0, %k0, %k1
; SKX_32-NEXT: vpscatterdd %ymm0, (,%ymm1) {%k1}
; SKX_32-NEXT: vmovdqa %ymm2, %ymm0
; SKX_32-NEXT: retl
@@ -388,9 +386,9 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
;
; SKX-LABEL: test7:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX-NEXT: kmovw %k1, %k2
+; SKX-NEXT: kmovq %k1, %k2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm1 {%k2}
; SKX-NEXT: vmovdqa %ymm1, %ymm2
; SKX-NEXT: vpgatherdd (%rdi,%ymm0,4), %ymm2 {%k1}
@@ -402,7 +400,7 @@ define <8 x i32> @test7(ptr %base, <8 x i32> %ind, i8 %mask) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: kmovq %k1, %k2
; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm1 {%k2}
; SKX_32-NEXT: vmovdqa %ymm1, %ymm2
; SKX_32-NEXT: vpgatherdd (%eax,%ymm0,4), %ymm2 {%k1}
@@ -453,18 +451,18 @@ define <16 x i32> @test8(<16 x ptr> %ptr.random, <16 x i32> %ind, i16 %mask) {
;
; SKX-LABEL: test8:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: kshiftrw $8, %k1, %k2
+; SKX-NEXT: kmovq %k2, %k3
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: kmovw %k2, %k3
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k3}
; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k3}
-; SKX-NEXT: kmovw %k1, %k3
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k3}
-; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm4
-; SKX-NEXT: vpgatherqd (,%zmm1), %ymm3 {%k2}
-; SKX-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
-; SKX-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm0
+; SKX-NEXT: kmovq %k1, %k3
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k3}
+; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm4
+; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
+; SKX-NEXT: vpgatherqd (,%zmm0), %ymm3 {%k1}
+; SKX-NEXT: vinserti64x4 $1, %ymm2, %zmm3, %zmm0
; SKX-NEXT: vpaddd %zmm0, %zmm4, %zmm0
; SKX-NEXT: retq
;
@@ -472,7 +470,7 @@ define <16 x i32> @test8(<16 x ptr> %ptr.random, <16 x i32> %ind, i16 %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: vpxor %xmm1, %xmm1, %xmm1
-; SKX_32-NEXT: kmovw %k1, %k2
+; SKX_32-NEXT: kmovq %k1, %k2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm1 {%k2}
; SKX_32-NEXT: vmovdqa64 %zmm1, %zmm2
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
@@ -533,10 +531,10 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm1, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -548,10 +546,10 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm1, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -559,9 +557,11 @@ define <8 x i32> @test9(ptr %base, <8 x i64> %ind1, <8 x i32>%ind5) {
;
; SKX_32-LABEL: test9:
; SKX_32: # %bb.0: # %entry
-; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
+; SKX_32-NEXT: vpslld $4, %ymm1, %ymm2
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; SKX_32-NEXT: vpslld $6, %ymm1, %ymm1
+; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
@@ -617,10 +617,10 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX_SMALL: # %bb.0: # %entry
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %zmm2
; SKX_SMALL-NEXT: vpmullq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm2, %zmm0
; SKX_SMALL-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_SMALL-NEXT: vpmuldq {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %zmm1, %zmm1
-; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm1, %zmm2, %zmm1
+; SKX_SMALL-NEXT: vpaddq %zmm0, %zmm1, %zmm1
; SKX_SMALL-NEXT: kxnorw %k0, %k0, %k1
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -632,10 +632,10 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
; SKX_LARGE-NEXT: vpmovzxdq {{.*#+}} zmm1 = ymm1[0],zero,ymm1[1],zero,ymm1[2],zero,ymm1[3],zero,ymm1[4],zero,ymm1[5],zero,ymm1[6],zero,ymm1[7],zero
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmuldq (%rax){1to8}, %zmm1, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm2, %zmm1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpmullq (%rax){1to8}, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm2, %zmm0
-; SKX_LARGE-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; SKX_LARGE-NEXT: vpaddq %zmm0, %zmm1, %zmm1
; SKX_LARGE-NEXT: kxnorw %k0, %k0, %k1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: vpgatherqd 72(,%zmm1), %ymm0 {%k1}
@@ -643,9 +643,11 @@ define <8 x i32> @test10(ptr %base, <8 x i64> %i1, <8 x i32>%ind5) {
;
; SKX_32-LABEL: test10:
; SKX_32: # %bb.0: # %entry
-; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm1, %ymm1
+; SKX_32-NEXT: vpslld $4, %ymm1, %ymm2
; SKX_32-NEXT: vpmovqd %zmm0, %ymm0
; SKX_32-NEXT: vpmulld {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %ymm0
+; SKX_32-NEXT: vpslld $6, %ymm1, %ymm1
+; SKX_32-NEXT: vpaddd %ymm2, %ymm1, %ymm1
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to8}, %ymm0, %ymm0
; SKX_32-NEXT: vpaddd %ymm1, %ymm0, %ymm1
; SKX_32-NEXT: kxnorw %k0, %k0, %k1
@@ -816,8 +818,8 @@ define <16 x float> @test14(ptr %base, i32 %ind, <16 x ptr> %vec) {
;
; KNL_32-LABEL: test14:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: vmovd %xmm0, %eax
; KNL_32-NEXT: vbroadcastss {{[0-9]+}}(%esp), %zmm1
+; KNL_32-NEXT: vmovd %xmm0, %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -888,7 +890,6 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB14_7
; KNL_64-NEXT: .LBB14_8: # %else8
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB14_5: # %cond.load4
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -898,7 +899,6 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_64-NEXT: .LBB14_7: # %cond.load7
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test15:
@@ -922,7 +922,6 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB14_7
; KNL_32-NEXT: .LBB14_8: # %else8
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB14_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -942,7 +941,6 @@ define <4 x float> @test15(ptr %base, <4 x i32> %ind, <4 x i1> %mask) {
; KNL_32-NEXT: .LBB14_7: # %cond.load7
; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0,1,2],mem[0]
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test15:
@@ -1106,7 +1104,6 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; KNL_64-NEXT: jne .LBB16_3
; KNL_64-NEXT: .LBB16_4: # %else2
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB16_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -1117,7 +1114,6 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test17:
@@ -1135,7 +1131,6 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; KNL_32-NEXT: jne .LBB16_3
; KNL_32-NEXT: .LBB16_4: # %else2
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB16_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -1146,7 +1141,6 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vmovhps {{.*#+}} xmm2 = xmm2[0,1],mem[0,1]
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test17:
@@ -1157,7 +1151,7 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB16_1
; SKX-NEXT: # %bb.2: # %else
@@ -1183,7 +1177,7 @@ define <2 x double> @test17(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x doub
; SKX_32-NEXT: vpmovq2m %xmm1, %k0
; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB16_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1240,7 +1234,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB17_7
; KNL_64-NEXT: .LBB17_8: # %else6
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB17_5: # %cond.store3
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -1250,7 +1243,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; KNL_64-NEXT: .LBB17_7: # %cond.store5
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vextractps $3, %xmm0, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test18:
@@ -1270,7 +1262,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB17_7
; KNL_32-NEXT: .LBB17_8: # %else6
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB17_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -1290,7 +1281,6 @@ define void @test18(<4 x i32>%a1, <4 x ptr> %ptr, <4 x i1>%mask) {
; KNL_32-NEXT: .LBB17_7: # %cond.store5
; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
; KNL_32-NEXT: vextractps $3, %xmm0, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test18:
@@ -1341,7 +1331,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB18_7
; KNL_64-NEXT: .LBB18_8: # %else6
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB18_5: # %cond.store3
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -1351,7 +1340,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_64-NEXT: .LBB18_7: # %cond.store5
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vmovhps %xmm0, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test19:
@@ -1383,7 +1371,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB18_7
; KNL_32-NEXT: .LBB18_8: # %else6
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB18_5: # %cond.store3
; KNL_32-NEXT: vpextrd $2, %xmm1, %ecx
@@ -1393,7 +1380,6 @@ define void @test19(<4 x double>%a1, ptr %ptr, <4 x i1>%mask, <4 x i64> %ind) {
; KNL_32-NEXT: .LBB18_7: # %cond.store5
; KNL_32-NEXT: vpextrd $3, %xmm1, %eax
; KNL_32-NEXT: vmovhps %xmm0, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test19:
@@ -1430,7 +1416,6 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB19_3
; KNL_64-NEXT: .LBB19_4: # %else2
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB19_1: # %cond.store
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -1440,7 +1425,6 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; KNL_64-NEXT: .LBB19_3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test20:
@@ -1454,7 +1438,6 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB19_3
; KNL_32-NEXT: .LBB19_4: # %else2
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB19_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -1464,14 +1447,13 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; KNL_32-NEXT: .LBB19_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test20:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB19_1
; SKX-NEXT: # %bb.2: # %else
@@ -1493,7 +1475,7 @@ define void @test20(<2 x float>%a1, <2 x ptr> %ptr, <2 x i1> %mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB19_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1527,7 +1509,6 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB20_3
; KNL_64-NEXT: .LBB20_4: # %else2
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB20_1: # %cond.store
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -1537,7 +1518,6 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; KNL_64-NEXT: .LBB20_3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vextractps $1, %xmm0, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test21:
@@ -1551,7 +1531,6 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB20_3
; KNL_32-NEXT: .LBB20_4: # %else2
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB20_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -1561,14 +1540,13 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; KNL_32-NEXT: .LBB20_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vextractps $1, %xmm0, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test21:
; SKX: # %bb.0:
; SKX-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX-NEXT: vpmovq2m %xmm2, %k0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB20_1
; SKX-NEXT: # %bb.2: # %else
@@ -1590,7 +1568,7 @@ define void @test21(<2 x i32>%a1, <2 x ptr> %ptr, <2 x i1>%mask) {
; SKX_32: # %bb.0:
; SKX_32-NEXT: vpsllq $63, %xmm2, %xmm2
; SKX_32-NEXT: vpmovq2m %xmm2, %k0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB20_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1632,7 +1610,6 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; KNL_64-NEXT: jne .LBB21_3
; KNL_64-NEXT: .LBB21_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB21_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -1644,7 +1621,6 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22:
@@ -1662,7 +1638,6 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; KNL_32-NEXT: jne .LBB21_3
; KNL_32-NEXT: .LBB21_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB21_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -1674,7 +1649,6 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22:
@@ -1685,7 +1659,7 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB21_1
; SKX-NEXT: # %bb.2: # %else
@@ -1712,7 +1686,7 @@ define <2 x float> @test22(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x float
; SKX_32-NEXT: vpmovq2m %xmm1, %k0
; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB21_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1755,7 +1729,6 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; KNL_64-NEXT: jne .LBB22_3
; KNL_64-NEXT: .LBB22_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB22_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -1767,7 +1740,6 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; KNL_64-NEXT: vmovaps %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test22a:
@@ -1786,7 +1758,6 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; KNL_32-NEXT: jne .LBB22_3
; KNL_32-NEXT: .LBB22_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB22_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -1798,7 +1769,6 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vinsertps {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[2,3]
; KNL_32-NEXT: vmovaps %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test22a:
@@ -1808,7 +1778,7 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB22_1
; SKX-NEXT: # %bb.2: # %else
@@ -1836,7 +1806,7 @@ define <2 x float> @test22a(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x floa
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB22_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -1882,7 +1852,6 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; KNL_64-NEXT: jne .LBB23_3
; KNL_64-NEXT: .LBB23_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB23_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -1893,7 +1862,6 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23:
@@ -1911,7 +1879,6 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; KNL_32-NEXT: jne .LBB23_3
; KNL_32-NEXT: .LBB23_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB23_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -1922,7 +1889,6 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23:
@@ -1933,7 +1899,7 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB23_1
; SKX-NEXT: # %bb.2: # %else
@@ -1959,7 +1925,7 @@ define <2 x i32> @test23(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i32> %s
; SKX_32-NEXT: vpmovq2m %xmm1, %k0
; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB23_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -2001,7 +1967,6 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_64-NEXT: jne .LBB24_3
; KNL_64-NEXT: .LBB24_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB24_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -2012,7 +1977,6 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test23b:
@@ -2031,7 +1995,6 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_32-NEXT: jne .LBB24_3
; KNL_32-NEXT: .LBB24_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB24_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -2042,7 +2005,6 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm2, %xmm2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test23b:
@@ -2052,7 +2014,7 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB24_1
; SKX-NEXT: # %bb.2: # %else
@@ -2079,7 +2041,7 @@ define <2 x i32> @test23b(ptr %base, <2 x i64> %ind, <2 x i1> %mask, <2 x i32> %
; SKX_32-NEXT: vpshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB24_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -2111,10 +2073,10 @@ define <2 x i32> @test24(ptr %base, <2 x i32> %ind) {
; KNL_64-NEXT: vmovq %rdi, %xmm1
; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
+; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL_64-NEXT: vpinsrd $1, (%rcx), %xmm0, %xmm0
+; KNL_64-NEXT: vpinsrd $1, (%rax), %xmm0, %xmm0
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test24:
@@ -2122,10 +2084,10 @@ define <2 x i32> @test24(ptr %base, <2 x i32> %ind) {
; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
+; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL_32-NEXT: vpinsrd $1, (%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $1, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test24:
@@ -2173,7 +2135,6 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; KNL_64-NEXT: jne .LBB26_3
; KNL_64-NEXT: .LBB26_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB26_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -2184,7 +2145,6 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm2, %xmm2
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test25:
@@ -2202,7 +2162,6 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; KNL_32-NEXT: jne .LBB26_3
; KNL_32-NEXT: .LBB26_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB26_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -2215,7 +2174,6 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm2, %xmm0
; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm2
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test25:
@@ -2226,7 +2184,7 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB26_1
; SKX-NEXT: # %bb.2: # %else
@@ -2252,7 +2210,7 @@ define <2 x i64> @test25(ptr %base, <2 x i32> %ind, <2 x i1> %mask, <2 x i64> %s
; SKX_32-NEXT: vpmovq2m %xmm1, %k0
; SKX_32-NEXT: vpslld $3, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB26_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -2287,8 +2245,8 @@ define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_64-NEXT: vmovq %rdi, %xmm1
; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
+; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; KNL_64-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; KNL_64-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2299,12 +2257,12 @@ define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
; KNL_32-NEXT: vpslld $3, %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
+; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
-; KNL_32-NEXT: vpinsrd $1, 4(%eax), %xmm0, %xmm0
-; KNL_32-NEXT: vpinsrd $2, (%ecx), %xmm0, %xmm0
-; KNL_32-NEXT: vpinsrd $3, 4(%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $1, 4(%ecx), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
+; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
; KNL_32-NEXT: retl
;
; SKX-LABEL: test26:
@@ -2313,8 +2271,8 @@ define <2 x i64> @test26(ptr %base, <2 x i32> %ind, <2 x i64> %src0) {
; SKX-NEXT: vpbroadcastq %rdi, %xmm1
; SKX-NEXT: vpsllq $3, %xmm0, %xmm0
; SKX-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; SKX-NEXT: vmovq %xmm0, %rax
-; SKX-NEXT: vpextrq $1, %xmm0, %rcx
+; SKX-NEXT: vpextrq $1, %xmm0, %rax
+; SKX-NEXT: vmovq %xmm0, %rcx
; SKX-NEXT: vmovsd {{.*#+}} xmm0 = mem[0],zero
; SKX-NEXT: vmovsd {{.*#+}} xmm1 = mem[0],zero
; SKX-NEXT: vmovlhps {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -2346,8 +2304,8 @@ define <2 x float> @test27(ptr %base, <2 x i32> %ind) {
; KNL_64-NEXT: vmovq %rdi, %xmm1
; KNL_64-NEXT: vpbroadcastq %xmm1, %xmm1
; KNL_64-NEXT: vpaddq %xmm0, %xmm1, %xmm0
-; KNL_64-NEXT: vmovq %xmm0, %rax
-; KNL_64-NEXT: vpextrq $1, %xmm0, %rcx
+; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
+; KNL_64-NEXT: vmovq %xmm0, %rcx
; KNL_64-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_64-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; KNL_64-NEXT: retq
@@ -2357,8 +2315,8 @@ define <2 x float> @test27(ptr %base, <2 x i32> %ind) {
; KNL_32-NEXT: vpslld $2, %xmm0, %xmm0
; KNL_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm1
; KNL_32-NEXT: vpaddd %xmm0, %xmm1, %xmm0
-; KNL_32-NEXT: vmovd %xmm0, %eax
-; KNL_32-NEXT: vpextrd $1, %xmm0, %ecx
+; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
+; KNL_32-NEXT: vmovd %xmm0, %ecx
; KNL_32-NEXT: vmovss {{.*#+}} xmm0 = mem[0],zero,zero,zero
; KNL_32-NEXT: vinsertps {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[2,3]
; KNL_32-NEXT: retl
@@ -2458,7 +2416,7 @@ define <16 x float> @test29(ptr %base, <16 x i32> %ind) {
; SKX: # %bb.0:
; SKX-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX-NEXT: movw $44, %ax
-; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: vgatherdps (%rdi,%zmm0,4), %zmm1 {%k1}
; SKX-NEXT: vmovaps %zmm1, %zmm0
; SKX-NEXT: retq
@@ -2468,7 +2426,7 @@ define <16 x float> @test29(ptr %base, <16 x i32> %ind) {
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vxorps %xmm1, %xmm1, %xmm1
; SKX_32-NEXT: movw $44, %cx
-; SKX_32-NEXT: kmovw %ecx, %k1
+; SKX_32-NEXT: kmovd %ecx, %k1
; SKX_32-NEXT: vgatherdps (%eax,%zmm0,4), %zmm1 {%k1}
; SKX_32-NEXT: vmovaps %zmm1, %zmm0
; SKX_32-NEXT: retl
@@ -2511,7 +2469,6 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; KNL_64-NEXT: vpgatherqd (,%zmm0), %ymm2 {%k1}
; KNL_64-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30:
@@ -2542,27 +2499,26 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; KNL_32-NEXT: vmovdqa %xmm2, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k0
+; SKX-NEXT: kmovd %esi, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k0
-; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
; SKX-NEXT: korw %k0, %k1, %k0
; SKX-NEXT: movb $-5, %al
-; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
-; SKX-NEXT: kmovw %edx, %k1
+; SKX-NEXT: kmovd %edx, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: movb $7, %al
-; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
@@ -2574,25 +2530,22 @@ define <3 x i32> @test30(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i
;
; SKX_32-LABEL: test30:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; SKX_32-NEXT: kshiftlb $7, %k0, %k0
; SKX_32-NEXT: kshiftrb $6, %k0, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $7, %k1, %k1
; SKX_32-NEXT: korw %k0, %k1, %k0
; SKX_32-NEXT: movb $-5, %al
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovd %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $5, %k1, %k1
; SKX_32-NEXT: korw %k1, %k0, %k0
; SKX_32-NEXT: movb $7, %al
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovd %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2634,7 +2587,6 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_64-NEXT: vpsllq $2, %ymm1, %ymm1
; KNL_64-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm2, (,%zmm0) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test30b:
@@ -2664,27 +2616,26 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
; KNL_32-NEXT: vpslld $2, %xmm1, %xmm1
; KNL_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test30b:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k0
+; SKX-NEXT: kmovd %esi, %k0
; SKX-NEXT: kshiftlb $7, %k0, %k0
; SKX-NEXT: kshiftrb $6, %k0, %k0
-; SKX-NEXT: kmovw %edi, %k1
+; SKX-NEXT: kmovd %edi, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $7, %k1, %k1
; SKX-NEXT: korw %k0, %k1, %k0
; SKX-NEXT: movb $-5, %al
-; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k0
-; SKX-NEXT: kmovw %edx, %k1
+; SKX-NEXT: kmovd %edx, %k1
; SKX-NEXT: kshiftlb $7, %k1, %k1
; SKX-NEXT: kshiftrb $5, %k1, %k1
; SKX-NEXT: korw %k1, %k0, %k0
; SKX-NEXT: movb $7, %al
-; SKX-NEXT: kmovw %eax, %k1
+; SKX-NEXT: kmovd %eax, %k1
; SKX-NEXT: kandw %k1, %k0, %k1
; SKX-NEXT: vpmovsxdq %xmm1, %ymm1
; SKX-NEXT: vpsllq $2, %ymm1, %ymm1
@@ -2695,25 +2646,22 @@ define void @test30b(<3 x ptr> %base, <3 x i32> %ind, <3 x i1> %mask, <3 x i32>
;
; SKX_32-LABEL: test30b:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k0
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k0
; SKX_32-NEXT: kshiftlb $7, %k0, %k0
; SKX_32-NEXT: kshiftrb $6, %k0, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $7, %k1, %k1
; SKX_32-NEXT: korw %k0, %k1, %k0
; SKX_32-NEXT: movb $-5, %al
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovd %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k0
-; SKX_32-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovb {{[0-9]+}}(%esp), %k1
; SKX_32-NEXT: kshiftlb $7, %k1, %k1
; SKX_32-NEXT: kshiftrb $5, %k1, %k1
; SKX_32-NEXT: korw %k1, %k0, %k0
; SKX_32-NEXT: movb $7, %al
-; SKX_32-NEXT: kmovw %eax, %k1
+; SKX_32-NEXT: kmovd %eax, %k1
; SKX_32-NEXT: kandw %k1, %k0, %k1
; SKX_32-NEXT: vpslld $2, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd %xmm1, %xmm0, %xmm0
@@ -2751,12 +2699,12 @@ define <16 x ptr> @test31(<16 x ptr> %ptrs) {
; SKX: # %bb.0:
; SKX-NEXT: kxnorw %k0, %k0, %k1
; SKX-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
; SKX-NEXT: kxnorw %k0, %k0, %k2
-; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
-; SKX-NEXT: vpgatherqq (,%zmm1), %zmm2 {%k1}
-; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
-; SKX-NEXT: vmovdqa64 %zmm2, %zmm1
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm2 {%k2}
+; SKX-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm3 {%k1}
+; SKX-NEXT: vmovdqa64 %zmm2, %zmm0
+; SKX-NEXT: vmovdqa64 %zmm3, %zmm1
; SKX-NEXT: retq
;
; SKX_32-LABEL: test31:
@@ -2795,9 +2743,8 @@ define <16 x i32> @test_gather_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i3
;
; SKX-LABEL: test_gather_16i32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vpgatherqd (,%zmm1), %ymm2 {%k2}
@@ -2807,9 +2754,8 @@ define <16 x i32> @test_gather_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i3
;
; SKX_32-LABEL: test_gather_16i32:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vpgatherdd (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovdqa64 %zmm2, %zmm0
; SKX_32-NEXT: retl
@@ -2854,12 +2800,12 @@ define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i6
;
; SKX-LABEL: test_gather_16i64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k1}
-; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vpgatherqq (,%zmm0), %zmm3 {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vpgatherqq (,%zmm1), %zmm4 {%k1}
; SKX-NEXT: vmovdqa64 %zmm3, %zmm0
; SKX-NEXT: vmovdqa64 %zmm4, %zmm1
; SKX-NEXT: retq
@@ -2873,9 +2819,8 @@ define <16 x i64> @test_gather_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i6
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vpgatherdq (,%ymm0), %zmm2 {%k1}
@@ -2914,9 +2859,8 @@ define <16 x float> @test_gather_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
;
; SKX-LABEL: test_gather_16f32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm2
; SKX-NEXT: kshiftrw $8, %k1, %k2
; SKX-NEXT: vgatherqps (,%zmm1), %ymm2 {%k2}
@@ -2926,9 +2870,8 @@ define <16 x float> @test_gather_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
;
; SKX_32-LABEL: test_gather_16f32:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vgatherdps (,%zmm0), %zmm2 {%k1}
; SKX_32-NEXT: vmovaps %zmm2, %zmm0
; SKX_32-NEXT: retl
@@ -2973,12 +2916,12 @@ define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
;
; SKX-LABEL: test_gather_16f64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k1}
-; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k2}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vgatherqpd (,%zmm0), %zmm3 {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vgatherqpd (,%zmm1), %zmm4 {%k1}
; SKX-NEXT: vmovapd %zmm3, %zmm0
; SKX-NEXT: vmovapd %zmm4, %zmm1
; SKX-NEXT: retq
@@ -2992,9 +2935,8 @@ define <16 x double> @test_gather_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
; SKX_32-NEXT: kshiftrw $8, %k1, %k2
; SKX_32-NEXT: vgatherdpd (,%ymm0), %zmm2 {%k1}
@@ -3019,7 +2961,6 @@ define void @test_scatter_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i32> %s
; KNL_64-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextracti64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i32:
@@ -3028,26 +2969,24 @@ define void @test_scatter_16i32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i32> %s
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vpscatterqd %ymm3, (,%zmm0) {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vextracti64x4 $1, %zmm3, %ymm0
-; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vpscatterqd %ymm0, (,%zmm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16i32:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vpscatterdd %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
@@ -3063,7 +3002,6 @@ define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %s
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16i64:
@@ -3086,17 +3024,16 @@ define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %s
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: .cfi_def_cfa %esp, 4
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16i64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k1}
-; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vpscatterqq %zmm3, (,%zmm0) {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vpscatterqq %zmm4, (,%zmm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
@@ -3109,14 +3046,14 @@ define void @test_scatter_16i64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x i64> %s
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
+; SKX_32-NEXT: kmovq %k1, %k2
+; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k2}
+; SKX_32-NEXT: kshiftrw $8, %k1, %k1
; SKX_32-NEXT: vmovdqa64 8(%ebp), %zmm1
-; SKX_32-NEXT: kshiftrw $8, %k1, %k2
-; SKX_32-NEXT: vpscatterdq %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextracti64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k2}
+; SKX_32-NEXT: vpscatterdq %zmm1, (,%ymm0) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4
@@ -3136,7 +3073,6 @@ define void @test_scatter_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float>
; KNL_64-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vextractf64x4 $1, %zmm3, %ymm0
; KNL_64-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f32:
@@ -3145,26 +3081,24 @@ define void @test_scatter_16f32(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x float>
; KNL_32-NEXT: vpslld $31, %zmm1, %zmm1
; KNL_32-NEXT: vptestmd %zmm1, %zmm1, %k1
; KNL_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f32:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k1}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vscatterqps %ymm3, (,%zmm0) {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm3, %ymm0
-; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k2}
+; SKX-NEXT: vscatterqps %ymm0, (,%zmm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
; SKX_32-LABEL: test_scatter_16f32:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
; SKX_32-NEXT: vscatterdps %zmm2, (,%zmm0) {%k1}
; SKX_32-NEXT: vzeroupper
; SKX_32-NEXT: retl
@@ -3181,7 +3115,6 @@ define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double>
; KNL_64-NEXT: kshiftrw $8, %k1, %k2
; KNL_64-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
; KNL_64-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_16f64:
@@ -3204,17 +3137,16 @@ define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double>
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: .cfi_def_cfa %esp, 4
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_16f64:
; SKX: # %bb.0:
-; SKX-NEXT: vpmovsxbd %xmm2, %zmm2
-; SKX-NEXT: vpslld $31, %zmm2, %zmm2
-; SKX-NEXT: vpmovd2m %zmm2, %k1
-; SKX-NEXT: kshiftrw $8, %k1, %k2
-; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k1}
-; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k2}
+; SKX-NEXT: vpsllw $7, %xmm2, %xmm2
+; SKX-NEXT: vpmovb2m %xmm2, %k1
+; SKX-NEXT: kmovq %k1, %k2
+; SKX-NEXT: vscatterqpd %zmm3, (,%zmm0) {%k2}
+; SKX-NEXT: kshiftrw $8, %k1, %k1
+; SKX-NEXT: vscatterqpd %zmm4, (,%zmm1) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
;
@@ -3227,14 +3159,14 @@ define void @test_scatter_16f64(<16 x ptr> %ptrs, <16 x i1> %mask, <16 x double>
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
-; SKX_32-NEXT: vpmovsxbd %xmm1, %zmm1
-; SKX_32-NEXT: vpslld $31, %zmm1, %zmm1
-; SKX_32-NEXT: vpmovd2m %zmm1, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm1, %xmm1
+; SKX_32-NEXT: vpmovb2m %xmm1, %k1
+; SKX_32-NEXT: kmovq %k1, %k2
+; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k2}
+; SKX_32-NEXT: kshiftrw $8, %k1, %k1
; SKX_32-NEXT: vmovapd 8(%ebp), %zmm1
-; SKX_32-NEXT: kshiftrw $8, %k1, %k2
-; SKX_32-NEXT: vscatterdpd %zmm2, (,%ymm0) {%k1}
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k2}
+; SKX_32-NEXT: vscatterdpd %zmm1, (,%ymm0) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4
@@ -3713,7 +3645,6 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; KNL_64-NEXT: jne .LBB47_3
; KNL_64-NEXT: .LBB47_4: # %else2
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB47_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -3725,7 +3656,6 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; KNL_64-NEXT: vmovaps %xmm1, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: large_index:
@@ -3745,7 +3675,6 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; KNL_32-NEXT: jne .LBB47_3
; KNL_32-NEXT: .LBB47_4: # %else2
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB47_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -3757,7 +3686,6 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vinsertps {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[2,3]
; KNL_32-NEXT: vmovaps %xmm1, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: large_index:
@@ -3770,7 +3698,7 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; SKX-NEXT: vpsllq $2, %xmm0, %xmm0
; SKX-NEXT: vpbroadcastq %rdi, %xmm2
; SKX-NEXT: vpaddq %xmm0, %xmm2, %xmm0
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB47_1
; SKX-NEXT: # %bb.2: # %else
@@ -3799,7 +3727,7 @@ define <2 x float> @large_index(ptr %base, <2 x i128> %ind, <2 x i1> %mask, <2 x
; SKX_32-NEXT: vpinsrd $1, {{[0-9]+}}(%esp), %xmm0, %xmm0
; SKX_32-NEXT: vpslld $2, %xmm0, %xmm0
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB47_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -4023,7 +3951,6 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB52_3
; KNL_64-NEXT: .LBB52_4: # %else2
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB52_1: # %cond.store
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -4033,7 +3960,6 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; KNL_64-NEXT: .LBB52_3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vmovhps %xmm0, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_2i32_index:
@@ -4050,7 +3976,6 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB52_3
; KNL_32-NEXT: .LBB52_4: # %else2
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB52_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -4060,7 +3985,6 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; KNL_32-NEXT: .LBB52_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vmovhps %xmm0, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_2i32_index:
@@ -4071,7 +3995,7 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; SKX-NEXT: vpmovsxdq %xmm1, %xmm1
; SKX-NEXT: vpsllq $3, %xmm1, %xmm1
; SKX-NEXT: vpaddq %xmm1, %xmm2, %xmm1
-; SKX-NEXT: kmovw %k0, %eax
+; SKX-NEXT: kmovd %k0, %eax
; SKX-NEXT: testb $1, %al
; SKX-NEXT: jne .LBB52_1
; SKX-NEXT: # %bb.2: # %else
@@ -4095,7 +4019,7 @@ define void @test_scatter_2i32_index(<2 x double> %a1, ptr %base, <2 x i32> %ind
; SKX_32-NEXT: vpmovq2m %xmm2, %k0
; SKX_32-NEXT: vpslld $3, %xmm1, %xmm1
; SKX_32-NEXT: vpaddd {{[0-9]+}}(%esp){1to4}, %xmm1, %xmm1
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB52_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -4129,8 +4053,8 @@ define <16 x float> @zext_index(ptr %base, <16 x i32> %ind) {
;
; KNL_32-LABEL: zext_index:
; KNL_32: # %bb.0:
-; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}{1to16}, %zmm0, %zmm1
+; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kxnorw %k0, %k0, %k1
; KNL_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; KNL_32-NEXT: vgatherdps (%eax,%zmm1,4), %zmm0 {%k1}
@@ -4258,7 +4182,6 @@ define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cm
; KNL_64-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
; KNL_64-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; KNL_64-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: test_scatter_setcc_split:
@@ -4281,15 +4204,14 @@ define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cm
; KNL_32-NEXT: movl %ebp, %esp
; KNL_32-NEXT: popl %ebp
; KNL_32-NEXT: .cfi_def_cfa %esp, 4
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: test_scatter_setcc_split:
; SKX: # %bb.0:
-; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; SKX-NEXT: vptestnmd %ymm4, %ymm4, %k1
-; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k2
-; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k2}
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1
+; SKX-NEXT: vscatterdpd %zmm2, (%rdi,%ymm0,8) {%k1}
+; SKX-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; SKX-NEXT: vptestnmd %ymm1, %ymm1, %k1
; SKX-NEXT: vextractf64x4 $1, %zmm0, %ymm0
; SKX-NEXT: vscatterdpd %zmm3, (%rdi,%ymm0,8) {%k1}
; SKX-NEXT: vzeroupper
@@ -4304,14 +4226,14 @@ define void @test_scatter_setcc_split(ptr %base, <16 x i32> %ind, <16 x i32> %cm
; SKX_32-NEXT: .cfi_def_cfa_register %ebp
; SKX_32-NEXT: andl $-64, %esp
; SKX_32-NEXT: subl $64, %esp
-; SKX_32-NEXT: vmovapd 72(%ebp), %zmm3
; SKX_32-NEXT: movl 8(%ebp), %eax
-; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm4
-; SKX_32-NEXT: vptestnmd %ymm4, %ymm4, %k1
-; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k2
-; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k2}
+; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k1
+; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
+; SKX_32-NEXT: vmovapd 72(%ebp), %zmm2
+; SKX_32-NEXT: vextracti64x4 $1, %zmm1, %ymm1
+; SKX_32-NEXT: vptestnmd %ymm1, %ymm1, %k1
; SKX_32-NEXT: vextractf64x4 $1, %zmm0, %ymm0
-; SKX_32-NEXT: vscatterdpd %zmm3, (%eax,%ymm0,8) {%k1}
+; SKX_32-NEXT: vscatterdpd %zmm2, (%eax,%ymm0,8) {%k1}
; SKX_32-NEXT: movl %ebp, %esp
; SKX_32-NEXT: popl %ebp
; SKX_32-NEXT: .cfi_def_cfa %esp, 4
@@ -4404,7 +4326,6 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB58_3
; KNL_64-NEXT: .LBB58_4: # %else2
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB58_1: # %cond.load
; KNL_64-NEXT: vmovq %xmm1, %rcx
@@ -4414,7 +4335,6 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; KNL_64-NEXT: .LBB58_3: # %cond.load1
; KNL_64-NEXT: vpextrq $1, %xmm1, %rax
; KNL_64-NEXT: vpinsrq $1, (%rax), %xmm0, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: gather_2i64_constant_indices:
@@ -4431,7 +4351,6 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB58_3
; KNL_32-NEXT: .LBB58_4: # %else2
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB58_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm1, %ecx
@@ -4442,7 +4361,6 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; KNL_32-NEXT: vpextrd $1, %xmm1, %eax
; KNL_32-NEXT: vpinsrd $2, (%eax), %xmm0, %xmm0
; KNL_32-NEXT: vpinsrd $3, 4(%eax), %xmm0, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: gather_2i64_constant_indices:
@@ -4451,7 +4369,7 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
-; SKX_SMALL-NEXT: kmovw %k0, %eax
+; SKX_SMALL-NEXT: kmovd %k0, %eax
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: testb $1, %al
; SKX_SMALL-NEXT: jne .LBB58_1
@@ -4477,7 +4395,7 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm1
-; SKX_LARGE-NEXT: kmovw %k0, %eax
+; SKX_LARGE-NEXT: kmovd %k0, %eax
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_LARGE-NEXT: testb $1, %al
; SKX_LARGE-NEXT: jne .LBB58_1
@@ -4502,7 +4420,7 @@ define <2 x i64> @gather_2i64_constant_indices(ptr %ptr, <2 x i1> %mask) {
; SKX_32-NEXT: vpmovq2m %xmm0, %k0
; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm1
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB58_1
@@ -4550,9 +4468,8 @@ define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
;
; SKX_SMALL-LABEL: gather_16i64_constant_indices:
; SKX_SMALL: # %bb.0:
-; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_SMALL-NEXT: vpmovb2m %xmm0, %k1
; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT: vpxor %xmm0, %xmm0, %xmm0
; SKX_SMALL-NEXT: vpgatherdd (%rdi,%zmm1,4), %zmm0 {%k1}
@@ -4560,9 +4477,8 @@ define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
;
; SKX_LARGE-LABEL: gather_16i64_constant_indices:
; SKX_LARGE: # %bb.0:
-; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_LARGE-NEXT: vpmovb2m %xmm0, %k1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm1
; SKX_LARGE-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -4571,9 +4487,8 @@ define <16 x i32> @gather_16i64_constant_indices(ptr %ptr, <16 x i1> %mask) {
;
; SKX_32-LABEL: gather_16i64_constant_indices:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_32-NEXT: vpmovb2m %xmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm1 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT: vpxor %xmm0, %xmm0, %xmm0
@@ -4599,7 +4514,6 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; KNL_64-NEXT: testb $2, %al
; KNL_64-NEXT: jne .LBB60_3
; KNL_64-NEXT: .LBB60_4: # %else2
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB60_1: # %cond.store
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -4609,7 +4523,6 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; KNL_64-NEXT: .LBB60_3: # %cond.store1
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vextractps $1, %xmm1, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scatter_2i64_constant_indices:
@@ -4625,7 +4538,6 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; KNL_32-NEXT: testb $2, %al
; KNL_32-NEXT: jne .LBB60_3
; KNL_32-NEXT: .LBB60_4: # %else2
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB60_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -4635,7 +4547,6 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; KNL_32-NEXT: .LBB60_3: # %cond.store1
; KNL_32-NEXT: vpextrd $1, %xmm0, %eax
; KNL_32-NEXT: vextractps $1, %xmm1, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: scatter_2i64_constant_indices:
@@ -4644,7 +4555,7 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; SKX_SMALL-NEXT: vpmovq2m %xmm0, %k0
; SKX_SMALL-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_SMALL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; SKX_SMALL-NEXT: kmovw %k0, %eax
+; SKX_SMALL-NEXT: kmovd %k0, %eax
; SKX_SMALL-NEXT: testb $1, %al
; SKX_SMALL-NEXT: jne .LBB60_1
; SKX_SMALL-NEXT: # %bb.2: # %else
@@ -4669,7 +4580,7 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; SKX_LARGE-NEXT: vpbroadcastq %rdi, %xmm0
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vpaddq (%rax), %xmm0, %xmm0
-; SKX_LARGE-NEXT: kmovw %k0, %eax
+; SKX_LARGE-NEXT: kmovd %k0, %eax
; SKX_LARGE-NEXT: testb $1, %al
; SKX_LARGE-NEXT: jne .LBB60_1
; SKX_LARGE-NEXT: # %bb.2: # %else
@@ -4693,7 +4604,7 @@ define void @scatter_2i64_constant_indices(ptr %ptr, <2 x i1> %mask, <2 x i32> %
; SKX_32-NEXT: vpmovq2m %xmm0, %k0
; SKX_32-NEXT: vpbroadcastd {{[0-9]+}}(%esp), %xmm0
; SKX_32-NEXT: vpaddd {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0, %xmm0
-; SKX_32-NEXT: kmovw %k0, %eax
+; SKX_32-NEXT: kmovd %k0, %eax
; SKX_32-NEXT: testb $1, %al
; SKX_32-NEXT: jne .LBB60_1
; SKX_32-NEXT: # %bb.2: # %else
@@ -4723,7 +4634,6 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
; KNL_64-NEXT: vptestmd %zmm0, %zmm0, %k1
; KNL_64-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_64-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scatter_16i64_constant_indices:
@@ -4734,14 +4644,12 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; KNL_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX_SMALL-LABEL: scatter_16i64_constant_indices:
; SKX_SMALL: # %bb.0:
-; SKX_SMALL-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_SMALL-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_SMALL-NEXT: vpmovd2m %zmm0, %k1
+; SKX_SMALL-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_SMALL-NEXT: vpmovb2m %xmm0, %k1
; SKX_SMALL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_SMALL-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
; SKX_SMALL-NEXT: vzeroupper
@@ -4749,9 +4657,8 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
;
; SKX_LARGE-LABEL: scatter_16i64_constant_indices:
; SKX_LARGE: # %bb.0:
-; SKX_LARGE-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_LARGE-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_LARGE-NEXT: vpmovd2m %zmm0, %k1
+; SKX_LARGE-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_LARGE-NEXT: vpmovb2m %xmm0, %k1
; SKX_LARGE-NEXT: movabsq ${{\.?LCPI[0-9]+_[0-9]+}}, %rax
; SKX_LARGE-NEXT: vmovdqa64 (%rax), %zmm0
; SKX_LARGE-NEXT: vpscatterdd %zmm1, (%rdi,%zmm0,4) {%k1}
@@ -4760,9 +4667,8 @@ define void @scatter_16i64_constant_indices(ptr %ptr, <16 x i1> %mask, <16 x i32
;
; SKX_32-LABEL: scatter_16i64_constant_indices:
; SKX_32: # %bb.0:
-; SKX_32-NEXT: vpmovsxbd %xmm0, %zmm0
-; SKX_32-NEXT: vpslld $31, %zmm0, %zmm0
-; SKX_32-NEXT: vpmovd2m %zmm0, %k1
+; SKX_32-NEXT: vpsllw $7, %xmm0, %xmm0
+; SKX_32-NEXT: vpmovb2m %xmm0, %k1
; SKX_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; SKX_32-NEXT: vmovdqa64 {{.*#+}} zmm0 = [0,4294967294,1,4294967288,10,20,50,65536,16777215,2147483647,100,4294965296,2147483648,76897723,7,4227069609]
; SKX_32-NEXT: vpscatterdd %zmm1, (%eax,%zmm0,4) {%k1}
@@ -4801,7 +4707,6 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; KNL_64-NEXT: jne .LBB62_7
; KNL_64-NEXT: .LBB62_8: # %else8
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB62_5: # %cond.load4
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -4812,7 +4717,6 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vpinsrd $3, (%rax), %xmm1, %xmm1
; KNL_64-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: splat_ptr_gather:
@@ -4834,7 +4738,6 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; KNL_32-NEXT: jne .LBB62_7
; KNL_32-NEXT: .LBB62_8: # %else8
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB62_1: # %cond.load
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -4855,7 +4758,6 @@ define <4 x i32> @splat_ptr_gather(ptr %ptr, <4 x i1> %mask, <4 x i32> %passthru
; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
; KNL_32-NEXT: vpinsrd $3, (%eax), %xmm1, %xmm1
; KNL_32-NEXT: vmovdqa %xmm1, %xmm0
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: splat_ptr_gather:
@@ -4910,7 +4812,6 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_64-NEXT: testb $8, %al
; KNL_64-NEXT: jne .LBB63_7
; KNL_64-NEXT: .LBB63_8: # %else6
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
; KNL_64-NEXT: .LBB63_5: # %cond.store3
; KNL_64-NEXT: vmovq %xmm0, %rcx
@@ -4920,7 +4821,6 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_64-NEXT: .LBB63_7: # %cond.store5
; KNL_64-NEXT: vpextrq $1, %xmm0, %rax
; KNL_64-NEXT: vextractps $3, %xmm1, (%rax)
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: splat_ptr_scatter:
@@ -4941,7 +4841,6 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_32-NEXT: testb $8, %al
; KNL_32-NEXT: jne .LBB63_7
; KNL_32-NEXT: .LBB63_8: # %else6
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
; KNL_32-NEXT: .LBB63_1: # %cond.store
; KNL_32-NEXT: vmovd %xmm0, %ecx
@@ -4961,7 +4860,6 @@ define void @splat_ptr_scatter(ptr %ptr, <4 x i1> %mask, <4 x i32> %val) {
; KNL_32-NEXT: .LBB63_7: # %cond.store5
; KNL_32-NEXT: vpextrd $3, %xmm0, %eax
; KNL_32-NEXT: vextractps $3, %xmm1, (%eax)
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: splat_ptr_scatter:
@@ -5070,7 +4968,6 @@ define void @scaleidx_x86scatter(<16 x float> %value, ptr %base, <16 x i32> %ind
; KNL_64-NEXT: kmovw %esi, %k1
; KNL_64-NEXT: vpaddd %zmm1, %zmm1, %zmm1
; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scaleidx_x86scatter:
@@ -5078,12 +4975,11 @@ define void @scaleidx_x86scatter(<16 x float> %value, ptr %base, <16 x i32> %ind
; KNL_32-NEXT: movl {{[0-9]+}}(%esp), %eax
; KNL_32-NEXT: kmovw {{[0-9]+}}(%esp), %k1
; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: scaleidx_x86scatter:
; SKX: # %bb.0:
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vpaddd %zmm1, %zmm1, %zmm1
; SKX-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,2) {%k1}
; SKX-NEXT: vzeroupper
@@ -5112,7 +5008,6 @@ define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i
; KNL_64-NEXT: kshiftlw $8, %k0, %k0
; KNL_64-NEXT: kshiftrw $8, %k0, %k1
; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scaleidx_scatter:
@@ -5125,13 +5020,12 @@ define void @scaleidx_scatter(<8 x float> %value, ptr %base, <8 x i32> %index, i
; KNL_32-NEXT: kshiftlw $8, %k0, %k0
; KNL_32-NEXT: kshiftrw $8, %k0, %k1
; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: scaleidx_scatter:
; SKX: # %bb.0:
; SKX-NEXT: vpaddd %ymm1, %ymm1, %ymm1
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
@@ -5159,7 +5053,6 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32
; KNL_64-NEXT: kshiftlw $8, %k0, %k0
; KNL_64-NEXT: kshiftrw $8, %k0, %k1
; KNL_64-NEXT: vscatterdps %zmm0, (%rdi,%zmm1,4) {%k1}
-; KNL_64-NEXT: vzeroupper
; KNL_64-NEXT: retq
;
; KNL_32-LABEL: scaleidx_scatter_outofrange:
@@ -5172,13 +5065,12 @@ define void @scaleidx_scatter_outofrange(<8 x float> %value, ptr %base, <8 x i32
; KNL_32-NEXT: kshiftlw $8, %k0, %k0
; KNL_32-NEXT: kshiftrw $8, %k0, %k1
; KNL_32-NEXT: vscatterdps %zmm0, (%eax,%zmm1,4) {%k1}
-; KNL_32-NEXT: vzeroupper
; KNL_32-NEXT: retl
;
; SKX-LABEL: scaleidx_scatter_outofrange:
; SKX: # %bb.0:
; SKX-NEXT: vpslld $2, %ymm1, %ymm1
-; SKX-NEXT: kmovw %esi, %k1
+; SKX-NEXT: kmovd %esi, %k1
; SKX-NEXT: vscatterdps %ymm0, (%rdi,%ymm1,4) {%k1}
; SKX-NEXT: vzeroupper
; SKX-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
index ee5fd78c643793..a4884ab5bf6fc5 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs-x32.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=cmov | FileCheck %s --check-prefixes=X86,X86-NOSSE
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse | FileCheck %s --check-prefixes=X86,X86-SSE1
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse2 | FileCheck %s --check-prefixes=X86,X86-SSE2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=i686-unknown-unknown -mcpu=generic -mattr=+sse4.1 | FileCheck %s --check-prefixes=X86,X86-SSE41
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
@@ -78,8 +78,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movzwl (%ecx), %ecx
-; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %cx
+; X86-NEXT: movzwl (%eax), %edx
; X86-NEXT: rolw $8, %dx
; X86-NEXT: movzwl %cx, %eax
; X86-NEXT: movzwl %dx, %ecx
@@ -147,8 +147,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzwl (%eax), %edx
-; X86-NEXT: movzwl (%ecx), %esi
; X86-NEXT: rolw $8, %dx
+; X86-NEXT: movzwl (%ecx), %esi
; X86-NEXT: rolw $8, %si
; X86-NEXT: cmpw %si, %dx
; X86-NEXT: jne .LBB9_3
@@ -549,9 +549,9 @@ define i1 @length11_eq(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
-; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl 7(%ecx), %ecx
; X86-NEXT: xorl 7(%eax), %ecx
+; X86-NEXT: orl %edx, %esi
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
@@ -571,9 +571,9 @@ define i1 @length12_eq(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl 4(%ecx), %esi
; X86-NEXT: xorl (%eax), %edx
; X86-NEXT: xorl 4(%eax), %esi
-; X86-NEXT: orl %edx, %esi
; X86-NEXT: movl 8(%ecx), %ecx
; X86-NEXT: xorl 8(%eax), %ecx
+; X86-NEXT: orl %edx, %esi
; X86-NEXT: orl %esi, %ecx
; X86-NEXT: setne %al
; X86-NEXT: popl %esi
@@ -677,6 +677,7 @@ define i1 @length14_eq(ptr %X, ptr %Y) nounwind {
define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
; X86-LABEL: length15_eq:
; X86: # %bb.0:
+; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -684,15 +685,16 @@ define i1 @length15_eq(ptr %X, ptr %Y) nounwind {
; X86-NEXT: movl 4(%edx), %eax
; X86-NEXT: xorl (%ecx), %esi
; X86-NEXT: xorl 4(%ecx), %eax
+; X86-NEXT: movl 8(%edx), %edi
+; X86-NEXT: xorl 8(%ecx), %edi
; X86-NEXT: orl %esi, %eax
-; X86-NEXT: movl 8(%edx), %esi
-; X86-NEXT: xorl 8(%ecx), %esi
; X86-NEXT: movl 11(%edx), %edx
; X86-NEXT: xorl 11(%ecx), %edx
-; X86-NEXT: orl %esi, %edx
+; X86-NEXT: orl %edi, %edx
; X86-NEXT: orl %eax, %edx
; X86-NEXT: sete %al
; X86-NEXT: popl %esi
+; X86-NEXT: popl %edi
; X86-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr %Y, i32 15) nounwind
%c = icmp eq i32 %m, 0
@@ -750,6 +752,7 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
define i1 @length16_eq(ptr %x, ptr %y) nounwind {
; X86-NOSSE-LABEL: length16_eq:
; X86-NOSSE: # %bb.0:
+; X86-NOSSE-NEXT: pushl %edi
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -757,19 +760,21 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
; X86-NOSSE-NEXT: movl 4(%edx), %eax
; X86-NOSSE-NEXT: xorl (%ecx), %esi
; X86-NOSSE-NEXT: xorl 4(%ecx), %eax
+; X86-NOSSE-NEXT: movl 8(%edx), %edi
+; X86-NOSSE-NEXT: xorl 8(%ecx), %edi
; X86-NOSSE-NEXT: orl %esi, %eax
-; X86-NOSSE-NEXT: movl 8(%edx), %esi
-; X86-NOSSE-NEXT: xorl 8(%ecx), %esi
; X86-NOSSE-NEXT: movl 12(%edx), %edx
; X86-NOSSE-NEXT: xorl 12(%ecx), %edx
-; X86-NOSSE-NEXT: orl %esi, %edx
+; X86-NOSSE-NEXT: orl %edi, %edx
; X86-NOSSE-NEXT: orl %eax, %edx
; X86-NOSSE-NEXT: setne %al
; X86-NOSSE-NEXT: popl %esi
+; X86-NOSSE-NEXT: popl %edi
; X86-NOSSE-NEXT: retl
;
; X86-SSE1-LABEL: length16_eq:
; X86-SSE1: # %bb.0:
+; X86-SSE1-NEXT: pushl %edi
; X86-SSE1-NEXT: pushl %esi
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %edx
@@ -777,15 +782,16 @@ define i1 @length16_eq(ptr %x, ptr %y) nounwind {
; X86-SSE1-NEXT: movl 4(%edx), %eax
; X86-SSE1-NEXT: xorl (%ecx), %esi
; X86-SSE1-NEXT: xorl 4(%ecx), %eax
+; X86-SSE1-NEXT: movl 8(%edx), %edi
+; X86-SSE1-NEXT: xorl 8(%ecx), %edi
; X86-SSE1-NEXT: orl %esi, %eax
-; X86-SSE1-NEXT: movl 8(%edx), %esi
-; X86-SSE1-NEXT: xorl 8(%ecx), %esi
; X86-SSE1-NEXT: movl 12(%edx), %edx
; X86-SSE1-NEXT: xorl 12(%ecx), %edx
-; X86-SSE1-NEXT: orl %esi, %edx
+; X86-SSE1-NEXT: orl %edi, %edx
; X86-SSE1-NEXT: orl %eax, %edx
; X86-SSE1-NEXT: setne %al
; X86-SSE1-NEXT: popl %esi
+; X86-SSE1-NEXT: popl %edi
; X86-SSE1-NEXT: retl
;
; X86-SSE2-LABEL: length16_eq:
@@ -918,17 +924,17 @@ define i1 @length16_eq_const(ptr %X) nounwind {
; X86-NOSSE: # %bb.0:
; X86-NOSSE-NEXT: pushl %esi
; X86-NOSSE-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT: movl $858927408, %ecx # imm = 0x33323130
-; X86-NOSSE-NEXT: xorl (%eax), %ecx
-; X86-NOSSE-NEXT: movl $926299444, %edx # imm = 0x37363534
-; X86-NOSSE-NEXT: xorl 4(%eax), %edx
+; X86-NOSSE-NEXT: movl $858927408, %edx # imm = 0x33323130
+; X86-NOSSE-NEXT: xorl (%eax), %edx
+; X86-NOSSE-NEXT: movl $926299444, %ecx # imm = 0x37363534
+; X86-NOSSE-NEXT: xorl 4(%eax), %ecx
+; X86-NOSSE-NEXT: movl $825243960, %esi # imm = 0x31303938
+; X86-NOSSE-NEXT: xorl 8(%eax), %esi
+; X86-NOSSE-NEXT: orl %edx, %ecx
+; X86-NOSSE-NEXT: movl $892613426, %edx # imm = 0x35343332
+; X86-NOSSE-NEXT: xorl 12(%eax), %edx
+; X86-NOSSE-NEXT: orl %esi, %edx
; X86-NOSSE-NEXT: orl %ecx, %edx
-; X86-NOSSE-NEXT: movl $825243960, %ecx # imm = 0x31303938
-; X86-NOSSE-NEXT: xorl 8(%eax), %ecx
-; X86-NOSSE-NEXT: movl $892613426, %esi # imm = 0x35343332
-; X86-NOSSE-NEXT: xorl 12(%eax), %esi
-; X86-NOSSE-NEXT: orl %ecx, %esi
-; X86-NOSSE-NEXT: orl %edx, %esi
; X86-NOSSE-NEXT: sete %al
; X86-NOSSE-NEXT: popl %esi
; X86-NOSSE-NEXT: retl
@@ -937,17 +943,17 @@ define i1 @length16_eq_const(ptr %X) nounwind {
; X86-SSE1: # %bb.0:
; X86-SSE1-NEXT: pushl %esi
; X86-SSE1-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE1-NEXT: movl $858927408, %ecx # imm = 0x33323130
-; X86-SSE1-NEXT: xorl (%eax), %ecx
-; X86-SSE1-NEXT: movl $926299444, %edx # imm = 0x37363534
-; X86-SSE1-NEXT: xorl 4(%eax), %edx
+; X86-SSE1-NEXT: movl $858927408, %edx # imm = 0x33323130
+; X86-SSE1-NEXT: xorl (%eax), %edx
+; X86-SSE1-NEXT: movl $926299444, %ecx # imm = 0x37363534
+; X86-SSE1-NEXT: xorl 4(%eax), %ecx
+; X86-SSE1-NEXT: movl $825243960, %esi # imm = 0x31303938
+; X86-SSE1-NEXT: xorl 8(%eax), %esi
+; X86-SSE1-NEXT: orl %edx, %ecx
+; X86-SSE1-NEXT: movl $892613426, %edx # imm = 0x35343332
+; X86-SSE1-NEXT: xorl 12(%eax), %edx
+; X86-SSE1-NEXT: orl %esi, %edx
; X86-SSE1-NEXT: orl %ecx, %edx
-; X86-SSE1-NEXT: movl $825243960, %ecx # imm = 0x31303938
-; X86-SSE1-NEXT: xorl 8(%eax), %ecx
-; X86-SSE1-NEXT: movl $892613426, %esi # imm = 0x35343332
-; X86-SSE1-NEXT: xorl 12(%eax), %esi
-; X86-SSE1-NEXT: orl %ecx, %esi
-; X86-SSE1-NEXT: orl %edx, %esi
; X86-SSE1-NEXT: sete %al
; X86-SSE1-NEXT: popl %esi
; X86-SSE1-NEXT: retl
@@ -1106,12 +1112,12 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X86-SSE2-LABEL: length24_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movdqu (%eax), %xmm0
-; X86-SSE2-NEXT: movdqu 8(%eax), %xmm1
-; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
-; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
@@ -1119,12 +1125,12 @@ define i1 @length24_eq_const(ptr %X) nounwind {
; X86-SSE41-LABEL: length24_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movdqu (%eax), %xmm0
-; X86-SSE41-NEXT: movdqu 8(%eax), %xmm1
-; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: movdqu 8(%eax), %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT: por %xmm1, %xmm0
-; X86-SSE41-NEXT: ptest %xmm0, %xmm0
+; X86-SSE41-NEXT: movdqu (%eax), %xmm1
+; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: por %xmm0, %xmm1
+; X86-SSE41-NEXT: ptest %xmm1, %xmm1
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 24) nounwind
@@ -1319,12 +1325,12 @@ define i1 @length31_eq_const(ptr %X) nounwind {
; X86-SSE2-LABEL: length31_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movdqu (%eax), %xmm0
-; X86-SSE2-NEXT: movdqu 15(%eax), %xmm1
-; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
-; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
@@ -1332,12 +1338,12 @@ define i1 @length31_eq_const(ptr %X) nounwind {
; X86-SSE41-LABEL: length31_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movdqu (%eax), %xmm0
-; X86-SSE41-NEXT: movdqu 15(%eax), %xmm1
-; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: movdqu 15(%eax), %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT: por %xmm1, %xmm0
-; X86-SSE41-NEXT: ptest %xmm0, %xmm0
+; X86-SSE41-NEXT: movdqu (%eax), %xmm1
+; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: por %xmm0, %xmm1
+; X86-SSE41-NEXT: ptest %xmm1, %xmm1
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 31) nounwind
@@ -1534,12 +1540,12 @@ define i1 @length32_eq_const(ptr %X) nounwind {
; X86-SSE2-LABEL: length32_eq_const:
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT: movdqu (%eax), %xmm0
-; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE2-NEXT: pand %xmm1, %xmm0
-; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X86-SSE2-NEXT: movdqu (%eax), %xmm1
+; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: pand %xmm0, %xmm1
+; X86-SSE2-NEXT: pmovmskb %xmm1, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: setne %al
; X86-SSE2-NEXT: retl
@@ -1547,12 +1553,12 @@ define i1 @length32_eq_const(ptr %X) nounwind {
; X86-SSE41-LABEL: length32_eq_const:
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-SSE41-NEXT: movdqu (%eax), %xmm0
-; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: movdqu 16(%eax), %xmm0
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
-; X86-SSE41-NEXT: por %xmm1, %xmm0
-; X86-SSE41-NEXT: ptest %xmm0, %xmm0
+; X86-SSE41-NEXT: movdqu (%eax), %xmm1
+; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: por %xmm0, %xmm1
+; X86-SSE41-NEXT: ptest %xmm1, %xmm1
; X86-SSE41-NEXT: setne %al
; X86-SSE41-NEXT: retl
%m = tail call i32 @memcmp(ptr %X, ptr @.str, i32 32) nounwind
@@ -1941,16 +1947,16 @@ define i1 @length63_eq_const(ptr %X) nounwind {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
-; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT: movdqu 47(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT: movdqu 47(%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm3
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
@@ -1960,16 +1966,16 @@ define i1 @length63_eq_const(ptr %X) nounwind {
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
-; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT: movdqu 47(%eax), %xmm3
-; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT: movdqu 47(%eax), %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: movdqu 16(%eax), %xmm3
+; X86-SSE41-NEXT: por %xmm2, %xmm1
+; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT: por %xmm3, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
-; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
@@ -2124,16 +2130,16 @@ define i1 @length64_eq_const(ptr %X) nounwind {
; X86-SSE2: # %bb.0:
; X86-SSE2-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE2-NEXT: movdqu (%eax), %xmm0
-; X86-SSE2-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE2-NEXT: movdqu 32(%eax), %xmm2
-; X86-SSE2-NEXT: movdqu 48(%eax), %xmm3
-; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE2-NEXT: movdqu 32(%eax), %xmm1
+; X86-SSE2-NEXT: movdqu 48(%eax), %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE2-NEXT: pand %xmm3, %xmm2
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE2-NEXT: movdqu 16(%eax), %xmm3
+; X86-SSE2-NEXT: pand %xmm2, %xmm1
+; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE2-NEXT: pand %xmm3, %xmm0
; X86-SSE2-NEXT: pand %xmm1, %xmm0
-; X86-SSE2-NEXT: pand %xmm2, %xmm0
; X86-SSE2-NEXT: pmovmskb %xmm0, %eax
; X86-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X86-SSE2-NEXT: sete %al
@@ -2143,16 +2149,16 @@ define i1 @length64_eq_const(ptr %X) nounwind {
; X86-SSE41: # %bb.0:
; X86-SSE41-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-SSE41-NEXT: movdqu (%eax), %xmm0
-; X86-SSE41-NEXT: movdqu 16(%eax), %xmm1
-; X86-SSE41-NEXT: movdqu 32(%eax), %xmm2
-; X86-SSE41-NEXT: movdqu 48(%eax), %xmm3
-; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
+; X86-SSE41-NEXT: movdqu 32(%eax), %xmm1
+; X86-SSE41-NEXT: movdqu 48(%eax), %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm2
-; X86-SSE41-NEXT: por %xmm3, %xmm2
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm1
+; X86-SSE41-NEXT: movdqu 16(%eax), %xmm3
+; X86-SSE41-NEXT: por %xmm2, %xmm1
+; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm3
; X86-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}, %xmm0
+; X86-SSE41-NEXT: por %xmm3, %xmm0
; X86-SSE41-NEXT: por %xmm1, %xmm0
-; X86-SSE41-NEXT: por %xmm2, %xmm0
; X86-SSE41-NEXT: ptest %xmm0, %xmm0
; X86-SSE41-NEXT: sete %al
; X86-SSE41-NEXT: retl
diff --git a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
index a46f9ed3d3798d..4b782ba60035da 100644
--- a/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
+++ b/llvm/test/CodeGen/X86/memcmp-more-load-pairs.ll
@@ -1,15 +1,15 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
; NOTE: This is a copy of llvm/test/CodeGen/X86/memcmp.ll with more load pairs. Please keep it that way.
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
+; RUN: llc -max-loads-per-memcmp=4 -memcmp-num-loads-per-block=4 < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
@@ -78,8 +78,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length2_lt:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
+; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %cx
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: movzwl %cx, %ecx
@@ -140,8 +140,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length3:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %ecx
-; X64-NEXT: movzwl (%rsi), %edx
; X64-NEXT: rolw $8, %cx
+; X64-NEXT: movzwl (%rsi), %edx
; X64-NEXT: rolw $8, %dx
; X64-NEXT: cmpw %dx, %cx
; X64-NEXT: jne .LBB9_3
@@ -494,8 +494,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length12:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB29_2
@@ -569,8 +569,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length16:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB33_2
@@ -639,8 +639,8 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length16_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB35_2
@@ -670,8 +670,8 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length16_gt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB36_2
@@ -743,15 +743,15 @@ define i32 @length24(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length24:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB38_3
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB38_3
@@ -835,15 +835,15 @@ define i1 @length24_lt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length24_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB40_3
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB40_3
@@ -873,15 +873,15 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length24_gt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB41_3
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB41_3
@@ -910,34 +910,34 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
define i1 @length24_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length24_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length24_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
@@ -963,22 +963,22 @@ define i32 @length31(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length31:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB43_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB43_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB43_4
@@ -1031,11 +1031,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX-LABEL: length31_eq:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1061,22 +1061,22 @@ define i1 @length31_lt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length31_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB45_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB45_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB45_4
@@ -1106,22 +1106,22 @@ define i1 @length31_gt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length31_gt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB46_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB46_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB46_4
@@ -1177,11 +1177,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
;
; X64-AVX-LABEL: length31_eq_prefer128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1206,34 +1206,34 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
define i1 @length31_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length31_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length31_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm0
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: length31_eq_const:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
@@ -1259,22 +1259,22 @@ define i32 @length32(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length32:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB49_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB49_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB49_4
@@ -1372,22 +1372,22 @@ define i1 @length32_lt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length32_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB51_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rcx
-; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 8(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB51_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rcx
-; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq 16(%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB51_4
@@ -1417,22 +1417,22 @@ define i1 @length32_gt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length32_gt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB52_4
; X64-NEXT: # %bb.1: # %loadbb1
; X64-NEXT: movq 8(%rdi), %rax
-; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq 8(%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB52_4
; X64-NEXT: # %bb.2: # %loadbb2
; X64-NEXT: movq 16(%rdi), %rax
-; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq 16(%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB52_4
@@ -1488,11 +1488,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
;
; X64-AVX-LABEL: length32_eq_prefer128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 16(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1517,24 +1517,24 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
define i1 @length32_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length32_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length32_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
@@ -1814,8 +1814,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX1-LABEL: length48_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
@@ -1826,8 +1826,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX2-LABEL: length48_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
@@ -1838,8 +1838,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX512-LABEL: length48_eq_const:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
@@ -1919,11 +1919,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX1-LABEL: length63_eq:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0
+; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -1931,11 +1931,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX2-LABEL: length63_eq:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0
+; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -1943,11 +1943,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX512-LABEL: length63_eq:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0
+; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX512-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
; X64-AVX512-NEXT: setne %al
; X64-AVX512-NEXT: vzeroupper
@@ -2004,16 +2004,16 @@ define i1 @length63_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length63_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT: movdqu 47(%rdi), %xmm3
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm1
+; X64-SSE2-NEXT: movdqu 47(%rdi), %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT: pand %xmm3, %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm3
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
@@ -2022,27 +2022,27 @@ define i1 @length63_eq_const(ptr %X) nounwind {
; X64-SSE41-LABEL: length63_eq_const:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT: movdqu 47(%rdi), %xmm3
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm1
+; X64-SSE41-NEXT: movdqu 47(%rdi), %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT: por %xmm3, %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm3
+; X64-SSE41-NEXT: por %xmm2, %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE41-NEXT: por %xmm3, %xmm0
; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: por %xmm2, %xmm0
; X64-SSE41-NEXT: ptest %xmm0, %xmm0
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
;
; X64-AVX1-LABEL: length63_eq_const:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -2050,11 +2050,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
;
; X64-AVX2-LABEL: length63_eq_const:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
@@ -2062,11 +2062,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
;
; X64-AVX512-LABEL: length63_eq_const:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
; X64-AVX512-NEXT: sete %al
; X64-AVX512-NEXT: vzeroupper
@@ -2144,11 +2144,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX1-LABEL: length64_eq:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0
+; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -2156,11 +2156,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX2-LABEL: length64_eq:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -2235,16 +2235,16 @@ define i1 @length64_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length64_eq_const:
; X64-SSE2: # %bb.0:
; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm2
-; X64-SSE2-NEXT: movdqu 48(%rdi), %xmm3
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-SSE2-NEXT: movdqu 32(%rdi), %xmm1
+; X64-SSE2-NEXT: movdqu 48(%rdi), %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE2-NEXT: pand %xmm3, %xmm2
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm3
+; X64-SSE2-NEXT: pand %xmm2, %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE2-NEXT: pand %xmm3, %xmm0
; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pand %xmm2, %xmm0
; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: sete %al
@@ -2253,27 +2253,27 @@ define i1 @length64_eq_const(ptr %X) nounwind {
; X64-SSE41-LABEL: length64_eq_const:
; X64-SSE41: # %bb.0:
; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm2
-; X64-SSE41-NEXT: movdqu 48(%rdi), %xmm3
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
+; X64-SSE41-NEXT: movdqu 32(%rdi), %xmm1
+; X64-SSE41-NEXT: movdqu 48(%rdi), %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm2
-; X64-SSE41-NEXT: por %xmm3, %xmm2
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm3
+; X64-SSE41-NEXT: por %xmm2, %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
+; X64-SSE41-NEXT: por %xmm3, %xmm0
; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: por %xmm2, %xmm0
; X64-SSE41-NEXT: ptest %xmm0, %xmm0
; X64-SSE41-NEXT: sete %al
; X64-SSE41-NEXT: retq
;
; X64-AVX1-LABEL: length64_eq_const:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -2281,11 +2281,11 @@ define i1 @length64_eq_const(ptr %X) nounwind {
;
; X64-AVX2-LABEL: length64_eq_const:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
@@ -2582,16 +2582,16 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
; X64-AVX1-LABEL: length127_eq:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT: vxorps 95(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1
+; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm2
+; X64-AVX1-NEXT: vxorps 95(%rsi), %ymm2, %ymm2
+; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3
+; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm3, %ymm2
; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -2600,16 +2600,16 @@ define i1 @length127_eq(ptr %x, ptr %y) nounwind {
; X64-AVX2-LABEL: length127_eq:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT: vpxor 95(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
+; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm2
+; X64-AVX2-NEXT: vpxor 95(%rsi), %ymm2, %ymm2
+; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3
+; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm3, %ymm2
; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -2718,16 +2718,16 @@ define i1 @length127_eq_const(ptr %X) nounwind {
; X64-AVX1-LABEL: length127_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm3
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1
+; X64-AVX1-NEXT: vmovups 95(%rdi), %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3
+; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -2736,16 +2736,16 @@ define i1 @length127_eq_const(ptr %X) nounwind {
; X64-AVX2-LABEL: length127_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm3
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
+; X64-AVX2-NEXT: vmovdqu 95(%rdi), %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3
+; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
@@ -2832,16 +2832,16 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
; X64-AVX1-LABEL: length128_eq:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT: vxorps 96(%rsi), %ymm3, %ymm3
-; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
-; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1
+; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm2
+; X64-AVX1-NEXT: vxorps 96(%rsi), %ymm2, %ymm2
+; X64-AVX1-NEXT: vxorps 64(%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3
+; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm3, %ymm2
; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -2850,16 +2850,16 @@ define i1 @length128_eq(ptr %x, ptr %y) nounwind {
; X64-AVX2-LABEL: length128_eq:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT: vpxor 96(%rsi), %ymm3, %ymm3
-; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
-; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
+; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm2
+; X64-AVX2-NEXT: vpxor 96(%rsi), %ymm2, %ymm2
+; X64-AVX2-NEXT: vpxor 64(%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3
+; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm3, %ymm2
; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -2968,16 +2968,16 @@ define i1 @length128_eq_const(ptr %X) nounwind {
; X64-AVX1-LABEL: length128_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm2
-; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm3
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; X64-AVX1-NEXT: vmovups 64(%rdi), %ymm1
+; X64-AVX1-NEXT: vmovups 96(%rdi), %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX1-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm3
+; X64-AVX1-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vorps %ymm2, %ymm0, %ymm0
+; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -2986,16 +2986,16 @@ define i1 @length128_eq_const(ptr %X) nounwind {
; X64-AVX2-LABEL: length128_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm2
-; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm3
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
+; X64-AVX2-NEXT: vmovdqu 64(%rdi), %ymm1
+; X64-AVX2-NEXT: vmovdqu 96(%rdi), %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm2
-; X64-AVX2-NEXT: vpor %ymm3, %ymm2, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm3
+; X64-AVX2-NEXT: vpor %ymm2, %ymm1, %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vpor %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
@@ -3106,9 +3106,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb (%rsi), %zmm0, %k1
+; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k2
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1
-; X64-AVX512BW-NEXT: kortestq %k1, %k0
+; X64-AVX512BW-NEXT: kortestq %k2, %k0
; X64-AVX512BW-NEXT: setne %al
; X64-AVX512BW-NEXT: vzeroupper
; X64-AVX512BW-NEXT: retq
@@ -3120,9 +3120,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k0
; X64-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1
+; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k2
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
-; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-AVX512F-NEXT: kortestw %k1, %k0
+; X64-AVX512F-NEXT: kortestw %k2, %k0
; X64-AVX512F-NEXT: setne %al
; X64-AVX512F-NEXT: vzeroupper
; X64-AVX512F-NEXT: retq
@@ -3144,9 +3144,9 @@ define i1 @length192_eq(ptr %x, ptr %y) nounwind {
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd (%rsi), %zmm0, %k1
+; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k2
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0
+; X64-MIC-AVX512F-NEXT: kortestw %k2, %k0
; X64-MIC-AVX512F-NEXT: setne %al
; X64-MIC-AVX512F-NEXT: vzeroupper
; X64-MIC-AVX512F-NEXT: retq
@@ -3226,9 +3226,9 @@ define i1 @length192_eq_const(ptr %X) nounwind {
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb .L.str(%rip), %zmm0, %k1
+; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k2
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
-; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512BW-NEXT: kortestq %k1, %k0
+; X64-AVX512BW-NEXT: kortestq %k2, %k0
; X64-AVX512BW-NEXT: sete %al
; X64-AVX512BW-NEXT: vzeroupper
; X64-AVX512BW-NEXT: retq
@@ -3240,9 +3240,9 @@ define i1 @length192_eq_const(ptr %X) nounwind {
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k0
; X64-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1
+; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k2
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
-; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-AVX512F-NEXT: kortestw %k1, %k0
+; X64-AVX512F-NEXT: kortestw %k2, %k0
; X64-AVX512F-NEXT: sete %al
; X64-AVX512F-NEXT: vzeroupper
; X64-AVX512F-NEXT: retq
@@ -3265,9 +3265,9 @@ define i1 @length192_eq_const(ptr %X) nounwind {
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str(%rip), %zmm0, %k1
+; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k2
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
-; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
-; X64-MIC-AVX512F-NEXT: kortestw %k1, %k0
+; X64-MIC-AVX512F-NEXT: kortestw %k2, %k0
; X64-MIC-AVX512F-NEXT: sete %al
; X64-MIC-AVX512F-NEXT: vzeroupper
; X64-MIC-AVX512F-NEXT: retq
@@ -3320,9 +3320,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-AVX512BW-NEXT: vpcmpneqb 191(%rsi), %zmm2, %k0
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT: vpcmpneqb 191(%rsi), %zmm3, %k0
; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k1
@@ -3337,9 +3337,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm2, %k0
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm3, %k0
; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1
@@ -3364,9 +3364,9 @@ define i1 @length255_eq(ptr %x, ptr %y) nounwind {
; X64-MIC-AVX512F: # %bb.0:
; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-MIC-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm2, %k0
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT: vpcmpneqd 191(%rsi), %zmm3, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1
@@ -3449,9 +3449,9 @@ define i1 @length255_eq_const(ptr %X) nounwind {
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-AVX512BW-NEXT: vpcmpneqb .L.str+191(%rip), %zmm2, %k0
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512BW-NEXT: vpcmpneqb .L.str+191(%rip), %zmm3, %k0
; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k1
@@ -3466,9 +3466,9 @@ define i1 @length255_eq_const(ptr %X) nounwind {
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm2, %k0
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm3, %k0
; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1
@@ -3494,9 +3494,9 @@ define i1 @length255_eq_const(ptr %X) nounwind {
; X64-MIC-AVX512F: # %bb.0:
; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm2
+; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm2, %k0
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT: vmovdqu64 191(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+191(%rip), %zmm3, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1
@@ -3555,9 +3555,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-AVX512BW-NEXT: vpcmpneqb 192(%rsi), %zmm2, %k0
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT: vpcmpneqb 192(%rsi), %zmm3, %k0
; X64-AVX512BW-NEXT: vpcmpneqb 128(%rsi), %zmm2, %k1
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb 64(%rsi), %zmm1, %k1
@@ -3572,9 +3572,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind {
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm2, %k0
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm3, %k0
; X64-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1
@@ -3599,9 +3599,9 @@ define i1 @length256_eq(ptr %x, ptr %y) nounwind {
; X64-MIC-AVX512F: # %bb.0:
; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-MIC-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm2, %k0
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT: vpcmpneqd 192(%rsi), %zmm3, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd 128(%rsi), %zmm2, %k1
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd 64(%rsi), %zmm1, %k1
@@ -3684,9 +3684,9 @@ define i1 @length256_eq_const(ptr %X) nounwind {
; X64-AVX512BW: # %bb.0:
; X64-AVX512BW-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512BW-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-AVX512BW-NEXT: vpcmpneqb .L.str+192(%rip), %zmm2, %k0
; X64-AVX512BW-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512BW-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512BW-NEXT: vpcmpneqb .L.str+192(%rip), %zmm3, %k0
; X64-AVX512BW-NEXT: vpcmpneqb .L.str+128(%rip), %zmm2, %k1
; X64-AVX512BW-NEXT: korq %k0, %k1, %k0
; X64-AVX512BW-NEXT: vpcmpneqb .L.str+64(%rip), %zmm1, %k1
@@ -3701,9 +3701,9 @@ define i1 @length256_eq_const(ptr %X) nounwind {
; X64-AVX512F: # %bb.0:
; X64-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm2, %k0
; X64-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm3, %k0
; X64-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
; X64-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1
@@ -3729,9 +3729,9 @@ define i1 @length256_eq_const(ptr %X) nounwind {
; X64-MIC-AVX512F: # %bb.0:
; X64-MIC-AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
; X64-MIC-AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
+; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm2
+; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm2, %k0
; X64-MIC-AVX512F-NEXT: vmovdqu64 128(%rdi), %zmm2
-; X64-MIC-AVX512F-NEXT: vmovdqu64 192(%rdi), %zmm3
-; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+192(%rip), %zmm3, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+128(%rip), %zmm2, %k1
; X64-MIC-AVX512F-NEXT: korw %k0, %k1, %k0
; X64-MIC-AVX512F-NEXT: vpcmpneqd .L.str+64(%rip), %zmm1, %k1
diff --git a/llvm/test/CodeGen/X86/memcmp.ll b/llvm/test/CodeGen/X86/memcmp.ll
index 8fe1a581cd9c2b..7c1efc1122f889 100644
--- a/llvm/test/CodeGen/X86/memcmp.ll
+++ b/llvm/test/CodeGen/X86/memcmp.ll
@@ -1,14 +1,14 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=sse4.1 | FileCheck %s --check-prefixes=X64,X64-SSE,X64-SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx2 | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512bw,+prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512bw,-prefer-256-bit | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,+prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,-prefer-256-bit,-prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-AVX,X64-AVX512,X64-AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,+prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=avx512f,-prefer-256-bit,+prefer-mask-registers | FileCheck %s --check-prefixes=X64,X64-MIC-AVX,X64-MIC-AVX512F
; This tests codegen time inlining/optimization of memcmp
; rdar://6480398
@@ -104,8 +104,8 @@ define i1 @length2_lt(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length2_lt:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %eax
-; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %ax
+; X64-NEXT: movzwl (%rsi), %ecx
; X64-NEXT: rolw $8, %cx
; X64-NEXT: movzwl %ax, %eax
; X64-NEXT: movzwl %cx, %ecx
@@ -166,8 +166,8 @@ define i32 @length3(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length3:
; X64: # %bb.0:
; X64-NEXT: movzwl (%rdi), %ecx
-; X64-NEXT: movzwl (%rsi), %edx
; X64-NEXT: rolw $8, %cx
+; X64-NEXT: movzwl (%rsi), %edx
; X64-NEXT: rolw $8, %dx
; X64-NEXT: cmpw %dx, %cx
; X64-NEXT: jne .LBB11_3
@@ -520,8 +520,8 @@ define i32 @length12(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length12:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB31_2
@@ -578,8 +578,8 @@ define i32 @length15(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length15:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB34_2
@@ -606,8 +606,8 @@ define i1 @length15_lt(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length15_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB35_2
@@ -709,8 +709,8 @@ define i32 @length16(ptr %X, ptr %Y) nounwind {
; X64-LABEL: length16:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB39_2
@@ -779,8 +779,8 @@ define i1 @length16_lt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length16_lt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rcx
+; X64-NEXT: movq (%rsi), %rdx
; X64-NEXT: bswapq %rdx
; X64-NEXT: cmpq %rdx, %rcx
; X64-NEXT: jne .LBB41_2
@@ -810,8 +810,8 @@ define i1 @length16_gt(ptr %x, ptr %y) nounwind {
; X64-LABEL: length16_gt:
; X64: # %bb.0:
; X64-NEXT: movq (%rdi), %rax
-; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rax
+; X64-NEXT: movq (%rsi), %rcx
; X64-NEXT: bswapq %rcx
; X64-NEXT: cmpq %rcx, %rax
; X64-NEXT: jne .LBB42_2
@@ -978,34 +978,34 @@ define i1 @length24_gt(ptr %x, ptr %y) nounwind {
define i1 @length24_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length24_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length24_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movq {{.*#+}} xmm1 = mem[0],zero
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movq {{.*#+}} xmm0 = mem[0],zero
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: length24_eq_const:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
-; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
@@ -1066,11 +1066,11 @@ define i1 @length31_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX-LABEL: length31_eq:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1152,11 +1152,11 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
;
; X64-AVX-LABEL: length31_eq_prefer128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 15(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 15(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1181,34 +1181,34 @@ define i1 @length31_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
define i1 @length31_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length31_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm1
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 15(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length31_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm1
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 15(%rdi), %xmm0
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
; X64-AVX-LABEL: length31_eq_const:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vmovdqu 15(%rdi), %xmm0
; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: setne %al
; X64-AVX-NEXT: retq
@@ -1370,11 +1370,11 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
;
; X64-AVX-LABEL: length32_eq_prefer128:
; X64-AVX: # %bb.0:
-; X64-AVX-NEXT: vmovdqu (%rdi), %xmm0
-; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm1
-; X64-AVX-NEXT: vpxor 16(%rsi), %xmm1, %xmm1
-; X64-AVX-NEXT: vpxor (%rsi), %xmm0, %xmm0
-; X64-AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu 16(%rdi), %xmm0
+; X64-AVX-NEXT: vpxor 16(%rsi), %xmm0, %xmm0
+; X64-AVX-NEXT: vmovdqu (%rdi), %xmm1
+; X64-AVX-NEXT: vpxor (%rsi), %xmm1, %xmm1
+; X64-AVX-NEXT: vpor %xmm0, %xmm1, %xmm0
; X64-AVX-NEXT: vptest %xmm0, %xmm0
; X64-AVX-NEXT: sete %al
; X64-AVX-NEXT: retq
@@ -1399,24 +1399,24 @@ define i1 @length32_eq_prefer128(ptr %x, ptr %y) nounwind "prefer-vector-width"=
define i1 @length32_eq_const(ptr %X) nounwind {
; X64-SSE2-LABEL: length32_eq_const:
; X64-SSE2: # %bb.0:
-; X64-SSE2-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE2-NEXT: pand %xmm1, %xmm0
-; X64-SSE2-NEXT: pmovmskb %xmm0, %eax
+; X64-SSE2-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE2-NEXT: pcmpeqb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE2-NEXT: pand %xmm0, %xmm1
+; X64-SSE2-NEXT: pmovmskb %xmm1, %eax
; X64-SSE2-NEXT: cmpl $65535, %eax # imm = 0xFFFF
; X64-SSE2-NEXT: setne %al
; X64-SSE2-NEXT: retq
;
; X64-SSE41-LABEL: length32_eq_const:
; X64-SSE41: # %bb.0:
-; X64-SSE41-NEXT: movdqu (%rdi), %xmm0
-; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm1
-; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: movdqu 16(%rdi), %xmm0
; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0
-; X64-SSE41-NEXT: por %xmm1, %xmm0
-; X64-SSE41-NEXT: ptest %xmm0, %xmm0
+; X64-SSE41-NEXT: movdqu (%rdi), %xmm1
+; X64-SSE41-NEXT: pxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1
+; X64-SSE41-NEXT: por %xmm0, %xmm1
+; X64-SSE41-NEXT: ptest %xmm1, %xmm1
; X64-SSE41-NEXT: setne %al
; X64-SSE41-NEXT: retq
;
@@ -1597,8 +1597,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX1-LABEL: length48_eq_const:
; X64-AVX1: # %bb.0:
; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %xmm1
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
@@ -1609,8 +1609,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX2-LABEL: length48_eq_const:
; X64-AVX2: # %bb.0:
; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
@@ -1621,8 +1621,8 @@ define i1 @length48_eq_const(ptr %X) nounwind {
; X64-AVX512-LABEL: length48_eq_const:
; X64-AVX512: # %bb.0:
; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu 32(%rdi), %xmm1
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
@@ -1669,11 +1669,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX1-LABEL: length63_eq:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0
+; X64-AVX1-NEXT: vxorps 31(%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -1681,11 +1681,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX2-LABEL: length63_eq:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0
+; X64-AVX2-NEXT: vpxor 31(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -1693,11 +1693,11 @@ define i1 @length63_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX512-LABEL: length63_eq:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm1, %ymm1
-; X64-AVX512-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0
+; X64-AVX512-NEXT: vpxor 31(%rsi), %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX512-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
; X64-AVX512-NEXT: setne %al
; X64-AVX512-NEXT: vzeroupper
@@ -1764,11 +1764,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
;
; X64-AVX1-LABEL: length63_eq_const:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 31(%rdi), %ymm0
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -1776,11 +1776,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
;
; X64-AVX2-LABEL: length63_eq_const:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 31(%rdi), %ymm0
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
@@ -1788,11 +1788,11 @@ define i1 @length63_eq_const(ptr %X) nounwind {
;
; X64-AVX512-LABEL: length63_eq_const:
; X64-AVX512: # %bb.0:
-; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm1
-; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT: vmovdqu 31(%rdi), %ymm0
; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX512-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX512-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX512-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX512-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX512-NEXT: vptest %ymm0, %ymm0
; X64-AVX512-NEXT: sete %al
; X64-AVX512-NEXT: vzeroupper
@@ -1837,11 +1837,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX1-LABEL: length64_eq:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm1, %ymm1
-; X64-AVX1-NEXT: vxorps (%rsi), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0
+; X64-AVX1-NEXT: vxorps 32(%rsi), %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps (%rsi), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: setne %al
; X64-AVX1-NEXT: vzeroupper
@@ -1849,11 +1849,11 @@ define i1 @length64_eq(ptr %x, ptr %y) nounwind {
;
; X64-AVX2-LABEL: length64_eq:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm1, %ymm1
-; X64-AVX2-NEXT: vpxor (%rsi), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
+; X64-AVX2-NEXT: vpxor 32(%rsi), %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor (%rsi), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: setne %al
; X64-AVX2-NEXT: vzeroupper
@@ -1938,11 +1938,11 @@ define i1 @length64_eq_const(ptr %X) nounwind {
;
; X64-AVX1-LABEL: length64_eq_const:
; X64-AVX1: # %bb.0:
-; X64-AVX1-NEXT: vmovups (%rdi), %ymm0
-; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm1
-; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vmovups 32(%rdi), %ymm0
; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX1-NEXT: vorps %ymm1, %ymm0, %ymm0
+; X64-AVX1-NEXT: vmovups (%rdi), %ymm1
+; X64-AVX1-NEXT: vxorps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX1-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX1-NEXT: vptest %ymm0, %ymm0
; X64-AVX1-NEXT: sete %al
; X64-AVX1-NEXT: vzeroupper
@@ -1950,11 +1950,11 @@ define i1 @length64_eq_const(ptr %X) nounwind {
;
; X64-AVX2-LABEL: length64_eq_const:
; X64-AVX2: # %bb.0:
-; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm0
-; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm1
-; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vmovdqu 32(%rdi), %ymm0
; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-AVX2-NEXT: vpor %ymm1, %ymm0, %ymm0
+; X64-AVX2-NEXT: vmovdqu (%rdi), %ymm1
+; X64-AVX2-NEXT: vpxor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; X64-AVX2-NEXT: vpor %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: vptest %ymm0, %ymm0
; X64-AVX2-NEXT: sete %al
; X64-AVX2-NEXT: vzeroupper
diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
index 7d882b772a64d1..c77400258485df 100644
--- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
+++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll
@@ -1,13 +1,13 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop | FileCheck %s --check-prefixes=XOP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FALLBACK
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW-FALLBACK
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VLBW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+xop | FileCheck %s --check-prefixes=XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+xop,+avx | FileCheck %s --check-prefixes=XOP
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+xop,+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefixes=AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512vl | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VL-FALLBACK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512BW-FALLBACK
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512,AVX512VL,AVX512VLBW
; These test cases are inspired by C++2a std::midpoint().
; See https://bugs.llvm.org/show_bug.cgi?id=40965
@@ -424,20 +424,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
;
; AVX2-LABEL: vec256_i64_signed_reg_reg:
; AVX2: # %bb.0:
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -485,20 +485,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -520,20 +520,20 @@ define <4 x i64> @vec256_i64_signed_reg_reg(<4 x i64> %a1, <4 x i64> %a2) nounwi
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
@@ -598,9 +598,9 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [9223372036854775808,9223372036854775808,9223372036854775808,9223372036854775808]
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm3
; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm2
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm4 = [1,1,1,1]
; AVX2-NEXT: vpcmpgtq %ymm3, %ymm2, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpor %ymm4, %ymm2, %ymm3
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
@@ -659,20 +659,20 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX512F-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT: vpminuq %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -694,20 +694,20 @@ define <4 x i64> @vec256_i64_unsigned_reg_reg(<4 x i64> %a1, <4 x i64> %a2) noun
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm1 killed $ymm1 def $zmm1
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vpcmpnleuq %zmm1, %zmm0, %k1
-; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm2
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-FALLBACK-NEXT: vpminuq %zmm1, %zmm0, %zmm3
; AVX512BW-FALLBACK-NEXT: vpmaxuq %zmm1, %zmm0, %zmm1
-; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
@@ -767,20 +767,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX2-LABEL: vec256_i64_signed_mem_reg:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm0, %ymm1, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm0, %ymm1, %ymm0
-; AVX2-NEXT: vpxor %ymm2, %ymm0, %ymm0
-; AVX2-NEXT: vpsubq %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm0, %ymm0
+; AVX2-NEXT: vpsubq %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpsrlq $1, %ymm0, %ymm3
; AVX2-NEXT: vpsrlq $33, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX2-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX2-NEXT: retq
@@ -829,20 +829,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT: vpminsq %zmm0, %zmm1, %zmm3
; AVX512F-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
-; AVX512F-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlq $1, %ymm0, %ymm2
+; AVX512F-NEXT: vpsubq %ymm3, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlq $1, %ymm0, %ymm3
; AVX512F-NEXT: vpsrlq $33, %ymm0, %ymm0
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
-; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX512F-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX512F-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512F-NEXT: retq
@@ -865,20 +865,20 @@ define <4 x i64> @vec256_i64_signed_mem_reg(ptr %a1_addr, <4 x i64> %a2) nounwin
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm0, %zmm1, %k1
-; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm2
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-FALLBACK-NEXT: vpminsq %zmm0, %zmm1, %zmm3
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm0, %zmm1, %zmm0
-; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm0, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm0, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm0, %ymm3
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm0, %ymm0
+; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm4, %ymm0
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm0, %ymm0
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm2, %ymm1
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm1, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
@@ -937,20 +937,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX2-LABEL: vec256_i64_signed_reg_mem:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -999,20 +999,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX512F-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512F-NEXT: vmovdqa (%rdi), %ymm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1035,20 +1035,20 @@ define <4 x i64> @vec256_i64_signed_reg_mem(<4 x i64> %a1, ptr %a2_addr) nounwin
; AVX512BW-FALLBACK-NEXT: # kill: def $ymm0 killed $ymm0 def $zmm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
@@ -1109,20 +1109,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %ymm0
; AVX2-NEXT: vmovdqa (%rsi), %ymm1
-; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm2
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX2-NEXT: vpor %ymm3, %ymm2, %ymm3
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX2-NEXT: vpcmpgtq %ymm1, %ymm0, %ymm3
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpsubq %ymm1, %ymm0, %ymm1
-; AVX2-NEXT: vpxor %ymm2, %ymm1, %ymm1
-; AVX2-NEXT: vpsubq %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX2-NEXT: vpxor %ymm3, %ymm1, %ymm1
+; AVX2-NEXT: vpsubq %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX2-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX2-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX2-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX2-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX2-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX2-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX2-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX2-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX2-NEXT: retq
@@ -1172,20 +1172,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512F-NEXT: vmovdqa (%rdi), %ymm0
; AVX512F-NEXT: vmovdqa (%rsi), %ymm1
; AVX512F-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512F-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512F-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512F-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512F-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512F-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512F-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512F-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512F-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512F-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512F-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512F-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512F-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512F-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512F-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512F-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512F-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512F-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512F-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512F-NEXT: retq
@@ -1209,20 +1209,20 @@ define <4 x i64> @vec256_i64_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rdi), %ymm0
; AVX512BW-FALLBACK-NEXT: vmovdqa (%rsi), %ymm1
; AVX512BW-FALLBACK-NEXT: vpcmpgtq %zmm1, %zmm0, %k1
-; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm2, %ymm2, %ymm2
-; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm3 = [1,1,1,1]
-; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm2, %zmm3 {%k1}
-; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm2
+; AVX512BW-FALLBACK-NEXT: vpbroadcastq {{.*#+}} ymm2 = [1,1,1,1]
+; AVX512BW-FALLBACK-NEXT: vpcmpeqd %ymm3, %ymm3, %ymm3
+; AVX512BW-FALLBACK-NEXT: vmovdqa64 %zmm3, %zmm2 {%k1}
+; AVX512BW-FALLBACK-NEXT: vpminsq %zmm1, %zmm0, %zmm3
; AVX512BW-FALLBACK-NEXT: vpmaxsq %zmm1, %zmm0, %zmm1
-; AVX512BW-FALLBACK-NEXT: vpsubq %ymm2, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpsubq %ymm3, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $1, %ymm1, %ymm3
; AVX512BW-FALLBACK-NEXT: vpsrlq $33, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm3, %ymm4
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm1, %ymm1
+; AVX512BW-FALLBACK-NEXT: vpsrlq $32, %ymm2, %ymm4
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm4, %ymm3, %ymm4
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm4, %ymm1
; AVX512BW-FALLBACK-NEXT: vpsllq $32, %ymm1, %ymm1
-; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm3, %ymm2, %ymm2
+; AVX512BW-FALLBACK-NEXT: vpmuludq %ymm2, %ymm3, %ymm2
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm0, %ymm2, %ymm0
; AVX512BW-FALLBACK-NEXT: vpaddq %ymm1, %ymm0, %ymm0
; AVX512BW-FALLBACK-NEXT: retq
@@ -1434,7 +1434,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1
@@ -1450,7 +1450,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0
@@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2169,7 +2169,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512F: # %bb.0:
; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2
; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3
-; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3
; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3
; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1
; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1
@@ -2193,8 +2193,8 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1
+; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
@@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2
; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2
; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1
-; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2
+; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem)
; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1
; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0
; AVX512VL-FALLBACK-NEXT: retq
diff --git a/llvm/test/CodeGen/X86/mul-constant-result.ll b/llvm/test/CodeGen/X86/mul-constant-result.ll
index 1f9e7a93ad0b90..d36bb7db6a5d65 100644
--- a/llvm/test/CodeGen/X86/mul-constant-result.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-result.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown -mcpu=generic | FileCheck %s --check-prefix=X86
; Incremental updates of the instruction depths should be enough for this test
; case.
@@ -13,9 +13,9 @@ define i32 @mult(i32, i32) local_unnamed_addr #0 {
; X86-NEXT: .cfi_def_cfa_offset 8
; X86-NEXT: .cfi_offset %esi, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: cmpl $2, %edx
; X86-NEXT: movl $1, %eax
; X86-NEXT: movl $1, %esi
+; X86-NEXT: cmpl $2, %edx
; X86-NEXT: jge .LBB0_2
; X86-NEXT: # %bb.1:
; X86-NEXT: movl %edx, %esi
diff --git a/llvm/test/CodeGen/X86/mul-i512.ll b/llvm/test/CodeGen/X86/mul-i512.ll
index 64f6746e616ede..83655d1ced54ae 100644
--- a/llvm/test/CodeGen/X86/mul-i512.ll
+++ b/llvm/test/CodeGen/X86/mul-i512.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown -mcpu=generic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic | FileCheck %s --check-prefix=X64
define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-LABEL: test_512:
@@ -9,38 +9,39 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: subl $180, %esp
+; X86-NEXT: subl $184, %esp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 28(%eax), %ebx
-; X86-NEXT: movl 24(%eax), %ebp
-; X86-NEXT: movl (%edx), %esi
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 24(%edx), %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl (%ecx), %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %esi
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl 28(%esi), %ebp
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl 4(%eax), %edi
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edi, %ebp
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
@@ -53,27 +54,26 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 20(%ecx), %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 20(%ecx), %ebp
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebp, %edi
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb %bl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %cl
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ebx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -81,40 +81,39 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 8(%eax), %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl 8(%ebx), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 12(%eax), %ecx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %esi, %ebp
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl 12(%ebx), %ebx
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %ebp
; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
@@ -129,123 +128,123 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %esi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ebp, (%esp) # 4-byte Folded Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl 8(%ecx), %ebp
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl 8(%ecx), %ebx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 12(%ecx), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %edx, %ebp
+; X86-NEXT: movl 12(%ecx), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebp, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: adcl %ecx, %ebx
; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl (%esi), %ebx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl (%ecx), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 4(%esi), %eax
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl 4(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebp, %ebx
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %ebp
-; X86-NEXT: setb %bl
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %ebx
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %esi
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %edi, %esi
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %esi, %ebp
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -256,8 +255,8 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
@@ -265,30 +264,29 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, %ebx
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: movl %ebx, %edx
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT: adcl %ebx, %eax
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT: adcl %edi, %eax
; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -305,63 +303,62 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl 16(%eax), %ecx
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl %edi, %ebx
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 20(%eax), %esi
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %esi, %ebp
-; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: addl %esi, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movzbl %cl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %edi, %ecx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl 20(%eax), %edi
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: mull %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %eax
+; X86-NEXT: addl %ecx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: adcl %esi, %ebx
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %esi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ecx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -375,27 +372,27 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl 28(%eax), %ecx
+; X86-NEXT: movl 28(%eax), %ebx
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: movzbl %bl, %eax
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -406,62 +403,63 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: adcl %ebx, %esi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl %esi, %edi
; X86-NEXT: setb %bl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: addl %esi, %ecx
-; X86-NEXT: movzbl %bl, %eax
-; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: addl %ebp, %esi
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ecx
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movzbl %bl, %esi
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT: adcl %esi, %eax
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: adcl $0, %eax
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: addl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT: adcl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: adcl $0, %edx
-; X86-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: adcl $0, %eax
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: addl (%esp), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
@@ -470,85 +468,84 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
+; X86-NEXT: mull %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edi, %esi
+; X86-NEXT: addl %ebx, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %edi
-; X86-NEXT: setb %cl
+; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl %ebp, %esi
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl %cl, %ecx
-; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: adcl %eax, %edx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, (%esp) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %ebp
; X86-NEXT: addl %ebx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %edi, %ebp
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, %ebx
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %edx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: addl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; X86-NEXT: addl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebx
@@ -746,37 +743,40 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: imull %eax, %ecx
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: imull %esi, %eax
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: addl %edx, %eax
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: imull %ebp, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: imull %edi, %esi
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: addl %esi, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NEXT: imull %ecx, %esi
+; X86-NEXT: imull %edi, %esi
; X86-NEXT: addl %edx, %esi
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: addl %ebx, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl (%esp), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %ecx, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edi, %ebx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %edi
@@ -790,26 +790,26 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
-; X86-NEXT: addl %ebp, %eax
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, (%esp) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl 60(%edi), %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: imull %eax, %esi
; X86-NEXT: movl 56(%edi), %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %eax, %ebp
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl %esi, %edx
; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: movl 60(%edi), %eax
-; X86-NEXT: imull %ebp, %eax
-; X86-NEXT: addl %eax, %ecx
; X86-NEXT: movl 48(%edi), %esi
; X86-NEXT: movl 52(%edi), %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -837,15 +837,15 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: addl %ebp, %ebx
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %ebx, %ebp
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl %ecx, %esi
; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
@@ -853,13 +853,13 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: adcl (%esp), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
@@ -890,6 +890,7 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: setb %cl
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %ebx
+; X86-NEXT: movl %ebx, %ebp
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl %cl, %eax
@@ -903,67 +904,67 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl 36(%esi), %ebp
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl 36(%esi), %eax
+; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %edi, %esi
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ecx, %ebx
-; X86-NEXT: setb %cl
-; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: adcl %ecx, %edi
+; X86-NEXT: setb %bl
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %ebx, %esi
-; X86-NEXT: movzbl %cl, %eax
+; X86-NEXT: addl %edi, %esi
+; X86-NEXT: movzbl %bl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: movl (%esp), %ecx # 4-byte Reload
-; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl (%esp), %ebx # 4-byte Reload
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
-; X86-NEXT: adcl $0, %ebx
; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: mull %edi
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: addl %ebp, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %ecx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: addl %edi, %ebp
+; X86-NEXT: addl %ebx, %ebp
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT: adcl %eax, %ebx
+; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: adcl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: adcl $0, %ebp
-; X86-NEXT: adcl $0, %ebx
+; X86-NEXT: adcl $0, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, %eax
@@ -974,44 +975,46 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ecx
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Reload
+; X86-NEXT: mull %edi
; X86-NEXT: movl %edx, %esi
-; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
; X86-NEXT: adcl %ecx, %esi
-; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, %esi
-; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT: movzbl %cl, %eax
; X86-NEXT: adcl %eax, %edx
; X86-NEXT: addl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; X86-NEXT: adcl %ebx, %edi
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
; X86-NEXT: adcl %eax, %esi
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl $0, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl 48(%esi), %edi
-; X86-NEXT: imull %edi, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl 48(%ecx), %edi
; X86-NEXT: movl %edi, %eax
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: mull %ebx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NEXT: mull %esi
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: movl 52(%esi), %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: imull %edi, %eax
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl 52(%ecx), %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl %esi, %ebx
; X86-NEXT: imull %eax, %ebx
; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: movl 56(%esi), %eax
+; X86-NEXT: movl 56(%ecx), %eax
; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: imull %ebp, %esi
@@ -1027,92 +1030,94 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: adcl %ebx, %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %edx, %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl %ebp, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: addl %ecx, %ebx
-; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: addl %ebx, %edi
+; X86-NEXT: adcl $0, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %ebp, %edi
-; X86-NEXT: setb %bl
+; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: setb %cl
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT: mull %ecx
-; X86-NEXT: addl %edi, %eax
-; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: movzbl %cl, %ecx
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl %esi, %edx
; X86-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: movl (%esp), %esi # 4-byte Reload
-; X86-NEXT: imull %esi, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl (%esp), %ebp # 4-byte Reload
+; X86-NEXT: movl %ebp, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: mull %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT: imull %ebp, %eax
+; X86-NEXT: addl %eax, %edx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %edx, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: addl %ecx, %edx
-; X86-NEXT: imull {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; X86-NEXT: addl %edx, %edi
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: imull %edi, %ecx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: imull %ebx, %ecx
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT: mull %ebp
+; X86-NEXT: mull %ebx
; X86-NEXT: addl %ecx, %edx
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT: imull %ebp, %ecx
+; X86-NEXT: imull %ebx, %ecx
; X86-NEXT: addl %edx, %ecx
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebp, %eax
-; X86-NEXT: mull %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %esi
; X86-NEXT: addl %ecx, %esi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: mull %ebp
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %esi, %edi
-; X86-NEXT: adcl %ebx, %ecx
-; X86-NEXT: setb %bl
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: adcl %edi, %ecx
+; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; X86-NEXT: mull %ebp
; X86-NEXT: addl %ecx, %eax
-; X86-NEXT: movzbl %bl, %ecx
+; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
; X86-NEXT: adcl %ecx, %edx
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
@@ -1122,19 +1127,19 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
-; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
; X86-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
-; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; X86-NEXT: adcl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
@@ -1160,12 +1165,12 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X86-NEXT: movl %esi, 36(%ecx)
; X86-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
; X86-NEXT: movl %esi, 40(%ecx)
-; X86-NEXT: movl %ebx, 44(%ecx)
+; X86-NEXT: movl %edi, 44(%ecx)
; X86-NEXT: movl %ebp, 48(%ecx)
-; X86-NEXT: movl %edi, 52(%ecx)
+; X86-NEXT: movl %ebx, 52(%ecx)
; X86-NEXT: movl %eax, 56(%ecx)
; X86-NEXT: movl %edx, 60(%ecx)
-; X86-NEXT: addl $180, %esp
+; X86-NEXT: addl $184, %esp
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -1182,255 +1187,253 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: pushq %rbx
; X64-NEXT: pushq %rax
; X64-NEXT: movq %rdx, (%rsp) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq (%rdi), %rbx
-; X64-NEXT: movq 8(%rdi), %rdi
-; X64-NEXT: movq 24(%rax), %r14
-; X64-NEXT: movq 16(%rax), %rax
-; X64-NEXT: movq (%rsi), %r8
-; X64-NEXT: movq 8(%rsi), %r11
-; X64-NEXT: movq %rsi, %r13
+; X64-NEXT: movq 24(%rdi), %r10
+; X64-NEXT: movq 16(%rdi), %rax
+; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq (%rsi), %r11
+; X64-NEXT: movq 8(%rsi), %rbp
+; X64-NEXT: movq %rsi, %r12
; X64-NEXT: movq %rax, %rsi
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
+; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r11
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rcx, %r9
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: mulq %rbp
; X64-NEXT: movq %rdx, %rcx
; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %r10, %r15
-; X64-NEXT: adcq %r9, %rcx
+; X64-NEXT: addq %r9, %r15
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: movq 8(%rdi), %rdi
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %esi
-; X64-NEXT: movq %r14, %rax
-; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %r9
-; X64-NEXT: addq %rcx, %r9
-; X64-NEXT: adcq %rsi, %rdx
-; X64-NEXT: movq %rdx, %r12
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r13
+; X64-NEXT: movq %rax, %r10
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: adcq %rsi, %r13
; X64-NEXT: movq %rbx, %rsi
; X64-NEXT: movq %rbx, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rbx
-; X64-NEXT: movq %rax, %r14
-; X64-NEXT: addq %rcx, %r14
-; X64-NEXT: adcq $0, %rbx
-; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: movq %rsi, %r8
+; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %r14, %rax
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %rbx, %rcx
-; X64-NEXT: setb %sil
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: mulq %r11
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %rbx
; X64-NEXT: addq %rcx, %rbx
+; X64-NEXT: adcq $0, %r14
+; X64-NEXT: movq %rsi, %rax
+; X64-NEXT: movq %rsi, %r11
+; X64-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: adcq %r14, %r9
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %rdi, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r14
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %r14
-; X64-NEXT: addq %rbp, %rbx
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
; X64-NEXT: adcq %r15, %r14
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: adcq $0, %r12
+; X64-NEXT: adcq $0, %r10
+; X64-NEXT: adcq $0, %r13
+; X64-NEXT: movq %r12, %rbp
; X64-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r13, %rsi
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq 16(%r13), %r10
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r13
+; X64-NEXT: movq 16(%r12), %rsi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r8
; X64-NEXT: movq %rdi, %rax
; X64-NEXT: movq %rdi, %r12
; X64-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq 24(%rsi), %rsi
-; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rdx, %rbx
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %r9, %r15
+; X64-NEXT: adcq $0, %rbx
+; X64-NEXT: movq 24(%rbp), %rdi
+; X64-NEXT: movq %r11, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r11
-; X64-NEXT: addq %rbp, %r11
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: setb %dil
+; X64-NEXT: addq %r15, %r11
+; X64-NEXT: adcq %rbx, %r9
+; X64-NEXT: setb %bpl
; X64-NEXT: movq %r12, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: mulq %rdi
; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rbp
-; X64-NEXT: addq %rcx, %rbp
-; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %r9, %rbx
+; X64-NEXT: movzbl %bpl, %eax
; X64-NEXT: adcq %rax, %r15
-; X64-NEXT: addq %rbx, %r13
-; X64-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: addq %rcx, %r8
+; X64-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r14, %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: adcq $0, %rbx
; X64-NEXT: adcq $0, %r15
-; X64-NEXT: addq %r9, %rbp
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; X64-NEXT: setb %dil
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %r11
+; X64-NEXT: addq %r10, %rbx
+; X64-NEXT: adcq %r13, %r15
+; X64-NEXT: setb %bpl
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
; X64-NEXT: movq %r14, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rdx, %r9
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq $0, %r9
-; X64-NEXT: movq %r8, %rax
; X64-NEXT: mulq %rsi
; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: addq %rbx, %rax
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: adcq %r9, %rcx
-; X64-NEXT: setb %r8b
-; X64-NEXT: movq %r14, %rax
+; X64-NEXT: movq %rax, %r11
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; X64-NEXT: movq %r10, %rax
; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rcx, %r9
+; X64-NEXT: adcq $0, %r8
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rdi
+; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: adcq %r8, %rcx
+; X64-NEXT: setb %r8b
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %rdi
; X64-NEXT: addq %rcx, %rax
; X64-NEXT: movq %rax, %rcx
; X64-NEXT: movzbl %r8b, %eax
; X64-NEXT: adcq %rax, %rdx
-; X64-NEXT: addq %rbp, %r11
+; X64-NEXT: addq %rbx, %r11
; X64-NEXT: movq %r11, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: adcq %r15, %rbx
-; X64-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: adcq %r15, %r9
+; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movzbl %bpl, %eax
; X64-NEXT: adcq %rax, %rcx
; X64-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq $0, %rdx
; X64-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq 32(%rdi), %r15
-; X64-NEXT: imulq %r15, %rsi
-; X64-NEXT: movq %r15, %rax
-; X64-NEXT: mulq %r10
-; X64-NEXT: movq %rax, %rcx
-; X64-NEXT: addq %rsi, %rdx
-; X64-NEXT: movq 40(%rdi), %rsi
-; X64-NEXT: imulq %rsi, %r10
-; X64-NEXT: addq %rdx, %r10
-; X64-NEXT: movq 48(%rdi), %rax
-; X64-NEXT: movq %rdi, %r8
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq 32(%rcx), %rbx
+; X64-NEXT: imulq %rbx, %rdi
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %rsi
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: movq 40(%rcx), %r8
+; X64-NEXT: imulq %r8, %rsi
+; X64-NEXT: addq %rdx, %rsi
+; X64-NEXT: movq 48(%rcx), %rax
+; X64-NEXT: movq %rcx, %r10
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Reload
-; X64-NEXT: imulq %r9, %rdi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: mulq %r11
-; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; X64-NEXT: imulq %r15, %rdi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; X64-NEXT: mulq %r14
+; X64-NEXT: movq %rax, %rcx
; X64-NEXT: addq %rdi, %rdx
-; X64-NEXT: movq 56(%r8), %r8
-; X64-NEXT: imulq %r11, %r8
-; X64-NEXT: addq %rdx, %r8
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %r10, %r8
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r15
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r15
+; X64-NEXT: movq 56(%r10), %r11
+; X64-NEXT: imulq %r14, %r11
+; X64-NEXT: addq %rdx, %r11
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq %rsi, %r11
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %rdi
-; X64-NEXT: movq %rax, %r15
-; X64-NEXT: addq %rcx, %r15
-; X64-NEXT: adcq $0, %rdi
-; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: movq %r15, %r10
+; X64-NEXT: mulq %rbx
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %rbx
+; X64-NEXT: addq %rdi, %rbx
+; X64-NEXT: adcq $0, %r9
+; X64-NEXT: movq %r14, %rax
+; X64-NEXT: mulq %r8
+; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r13
-; X64-NEXT: addq %r15, %r13
-; X64-NEXT: adcq %rdi, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %rsi
+; X64-NEXT: addq %rbx, %r13
+; X64-NEXT: adcq %r9, %r15
+; X64-NEXT: setb %sil
+; X64-NEXT: movq %r10, %rax
+; X64-NEXT: mulq %r8
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r10
-; X64-NEXT: addq %rcx, %r10
-; X64-NEXT: movzbl %dil, %eax
+; X64-NEXT: addq %r15, %r10
+; X64-NEXT: movzbl %sil, %eax
; X64-NEXT: adcq %rax, %r12
-; X64-NEXT: addq %rbx, %r10
-; X64-NEXT: adcq %r8, %r12
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; X64-NEXT: movq 48(%r8), %rsi
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, %rax
-; X64-NEXT: mulq %rsi
-; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %rcx, %r10
+; X64-NEXT: adcq %r11, %r12
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT: movq 56(%rdx), %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: imulq %rax, %rsi
+; X64-NEXT: movq 48(%rdx), %rcx
+; X64-NEXT: movq %rdx, %rdi
+; X64-NEXT: movq %rax, %rbp
+; X64-NEXT: mulq %rcx
+; X64-NEXT: movq %rax, %r9
+; X64-NEXT: addq %rsi, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; X64-NEXT: imulq %r14, %rsi
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: movq %r8, %rdx
-; X64-NEXT: movq 56(%r8), %rax
-; X64-NEXT: imulq %rdi, %rax
-; X64-NEXT: movq %rdi, %r8
-; X64-NEXT: addq %rax, %rsi
-; X64-NEXT: movq 32(%rdx), %rbp
-; X64-NEXT: movq 40(%rdx), %r9
+; X64-NEXT: imulq %r14, %rcx
+; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: movq 32(%rdi), %r15
+; X64-NEXT: movq 40(%rdi), %rbx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT: movq %rax, %rsi
+; X64-NEXT: imulq %rbx, %rsi
+; X64-NEXT: mulq %r15
; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: imulq %r9, %rdi
-; X64-NEXT: mulq %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %rdi, %rdx
+; X64-NEXT: addq %rsi, %rdx
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; X64-NEXT: imulq %rbp, %r11
+; X64-NEXT: imulq %r15, %r11
; X64-NEXT: addq %rdx, %r11
-; X64-NEXT: addq %rcx, %rbx
-; X64-NEXT: adcq %rsi, %r11
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq %rax, %rsi
-; X64-NEXT: movq %r9, %rax
-; X64-NEXT: mulq %r8
-; X64-NEXT: movq %rdx, %r15
-; X64-NEXT: movq %rax, %rdi
-; X64-NEXT: addq %rcx, %rdi
-; X64-NEXT: adcq $0, %r15
-; X64-NEXT: movq %rbp, %rax
-; X64-NEXT: mulq %r14
-; X64-NEXT: movq %rdx, %rcx
+; X64-NEXT: addq %r9, %rdi
+; X64-NEXT: adcq %rcx, %r11
+; X64-NEXT: movq %r15, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %r9
; X64-NEXT: movq %rax, %r8
-; X64-NEXT: addq %rdi, %r8
-; X64-NEXT: adcq %r15, %rcx
-; X64-NEXT: setb %dil
-; X64-NEXT: movq %r9, %rax
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %rbp
+; X64-NEXT: movq %rdx, %rbp
+; X64-NEXT: movq %rax, %rcx
+; X64-NEXT: addq %r9, %rcx
+; X64-NEXT: adcq $0, %rbp
+; X64-NEXT: movq %r15, %rax
; X64-NEXT: mulq %r14
-; X64-NEXT: addq %rcx, %rax
-; X64-NEXT: movzbl %dil, %ecx
+; X64-NEXT: movq %rdx, %r9
+; X64-NEXT: movq %rax, %r15
+; X64-NEXT: addq %rcx, %r15
+; X64-NEXT: adcq %rbp, %r9
+; X64-NEXT: setb %cl
+; X64-NEXT: movq %rbx, %rax
+; X64-NEXT: mulq %r14
+; X64-NEXT: addq %r9, %rax
+; X64-NEXT: movzbl %cl, %ecx
; X64-NEXT: adcq %rcx, %rdx
-; X64-NEXT: addq %rbx, %rax
+; X64-NEXT: addq %rdi, %rax
; X64-NEXT: adcq %r11, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq %r13, %r8
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq %r13, %r15
; X64-NEXT: adcq %r10, %rax
; X64-NEXT: adcq %r12, %rdx
-; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: addq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
+; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Folded Reload
; X64-NEXT: adcq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
; X64-NEXT: movq (%rsp), %rcx # 8-byte Reload
@@ -1438,12 +1441,12 @@ define void @test_512(ptr %a, ptr %b, ptr %out) nounwind {
; X64-NEXT: movq %rdi, (%rcx)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
; X64-NEXT: movq %rdi, 8(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 16(%rcx)
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT: movq %rdi, 24(%rcx)
-; X64-NEXT: movq %rsi, 32(%rcx)
-; X64-NEXT: movq %r8, 40(%rcx)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq %rsi, 16(%rcx)
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT: movq %rsi, 24(%rcx)
+; X64-NEXT: movq %r8, 32(%rcx)
+; X64-NEXT: movq %r15, 40(%rcx)
; X64-NEXT: movq %rax, 48(%rcx)
; X64-NEXT: movq %rdx, 56(%rcx)
; X64-NEXT: addq $8, %rsp
diff --git a/llvm/test/CodeGen/X86/mul64.ll b/llvm/test/CodeGen/X86/mul64.ll
index 25d10b06402d26..91445dc5ff2583 100644
--- a/llvm/test/CodeGen/X86/mul64.ll
+++ b/llvm/test/CodeGen/X86/mul64.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i386-unknown | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i386-unknown -mcpu=generic | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=generic | FileCheck %s --check-prefix=X64
define i64 @foo(i64 %t, i64 %u) nounwind {
; X86-LABEL: foo:
diff --git a/llvm/test/CodeGen/X86/pr62653.ll b/llvm/test/CodeGen/X86/pr62653.ll
index b6a1bf47983dc7..fff7420d3e1778 100644
--- a/llvm/test/CodeGen/X86/pr62653.ll
+++ b/llvm/test/CodeGen/X86/pr62653.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=generic | FileCheck %s
define <64 x i4> @pr62653(<64 x i4> %a0) nounwind {
; CHECK-LABEL: pr62653:
diff --git a/llvm/test/CodeGen/X86/rotate-multi.ll b/llvm/test/CodeGen/X86/rotate-multi.ll
index 8b4c852fd7ef72..50a6964a130c18 100644
--- a/llvm/test/CodeGen/X86/rotate-multi.ll
+++ b/llvm/test/CodeGen/X86/rotate-multi.ll
@@ -1,5 +1,5 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -mtriple=x86_64-- < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-- -mcpu=x86-64 < %s | FileCheck %s
; OR of two rotates of %a0(edi).
define i32 @f0(i32 %a0) #0 {
diff --git a/llvm/test/CodeGen/X86/sad.ll b/llvm/test/CodeGen/X86/sad.ll
index 245516974d15b5..bfa4fbae8a7d0a 100644
--- a/llvm/test/CodeGen/X86/sad.ll
+++ b/llvm/test/CodeGen/X86/sad.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
@a = dso_local global [1024 x i8] zeroinitializer, align 16
@b = dso_local global [1024 x i8] zeroinitializer, align 16
@@ -810,11 +810,11 @@ define dso_local i32 @sad_nonloop_32i8(ptr nocapture readonly %p, i64, ptr nocap
;
; AVX1-LABEL: sad_nonloop_32i8:
; AVX1: # %bb.0:
-; AVX1-NEXT: vmovdqu (%rdi), %xmm0
-; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
-; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu 16(%rdi), %xmm0
+; AVX1-NEXT: vpsadbw 16(%rdx), %xmm0, %xmm0
+; AVX1-NEXT: vmovdqu (%rdi), %xmm1
+; AVX1-NEXT: vpsadbw (%rdx), %xmm1, %xmm1
+; AVX1-NEXT: vpaddq %xmm0, %xmm1, %xmm0
; AVX1-NEXT: vpshufd {{.*#+}} xmm1 = xmm0[2,3,2,3]
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
; AVX1-NEXT: vmovd %xmm0, %eax
@@ -892,12 +892,12 @@ define dso_local i32 @sad_nonloop_64i8(ptr nocapture readonly %p, i64, ptr nocap
; AVX1: # %bb.0:
; AVX1-NEXT: vmovdqu (%rdi), %xmm0
; AVX1-NEXT: vmovdqu 16(%rdi), %xmm1
-; AVX1-NEXT: vmovdqu 32(%rdi), %xmm2
-; AVX1-NEXT: vmovdqu 48(%rdi), %xmm3
-; AVX1-NEXT: vpsadbw 48(%rdx), %xmm3, %xmm3
+; AVX1-NEXT: vmovdqu 48(%rdi), %xmm2
+; AVX1-NEXT: vpsadbw 48(%rdx), %xmm2, %xmm2
; AVX1-NEXT: vpsadbw 16(%rdx), %xmm1, %xmm1
-; AVX1-NEXT: vpaddq %xmm3, %xmm1, %xmm1
-; AVX1-NEXT: vpsadbw 32(%rdx), %xmm2, %xmm2
+; AVX1-NEXT: vmovdqu 32(%rdi), %xmm3
+; AVX1-NEXT: vpaddq %xmm2, %xmm1, %xmm1
+; AVX1-NEXT: vpsadbw 32(%rdx), %xmm3, %xmm2
; AVX1-NEXT: vpsadbw (%rdx), %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm2, %xmm0, %xmm0
; AVX1-NEXT: vpaddq %xmm1, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/sext-vsetcc.ll b/llvm/test/CodeGen/X86/sext-vsetcc.ll
index 0990c0b12f79af..f17ec99058c8d5 100644
--- a/llvm/test/CodeGen/X86/sext-vsetcc.ll
+++ b/llvm/test/CodeGen/X86/sext-vsetcc.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512
declare void @use_v8i1(<8 x i1>)
declare void @use_v8i8(<8 x i8>)
@@ -57,7 +57,7 @@ define <8 x i16> @cmp_ne_load_const_volatile(ptr %x) nounwind {
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -107,7 +107,7 @@ define <8 x i16> @cmp_ne_load_const_extra_use1(ptr %x) nounwind {
; AVX512-NEXT: callq use_v8i8 at PLT
; AVX512-NEXT: vpxor %xmm0, %xmm0, %xmm0
; AVX512-NEXT: vpcmpeqb (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: addq $24, %rsp
; AVX512-NEXT: vzeroupper
@@ -159,7 +159,7 @@ define <8 x i16> @cmp_ne_load_const_extra_use2(ptr %x) nounwind {
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vmovdqu64 %zmm0, (%rsp) # 64-byte Spill
; AVX512-NEXT: vpmovzxbw {{.*#+}} xmm0 = xmm0[0],zero,xmm0[1],zero,xmm0[2],zero,xmm0[3],zero,xmm0[4],zero,xmm0[5],zero,xmm0[6],zero,xmm0[7],zero
; AVX512-NEXT: vzeroupper
@@ -202,7 +202,7 @@ define <8 x i16> @cmp_ne_no_load_const(i64 %x) nounwind {
; AVX512-NEXT: vmovq %rdi, %xmm0
; AVX512-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vpmovsxbw %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -267,7 +267,7 @@ define <3 x i32> @cmp_ult_load_const_bad_type(ptr %x) nounwind {
; AVX512-NEXT: vmovd {{.*#+}} xmm0 = mem[0],zero,zero,zero
; AVX512-NEXT: vpmaxub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm1
; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vpmovsxbd %xmm0, %xmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -326,7 +326,7 @@ define <2 x i64> @cmp_ne_zextload(ptr %x, ptr %y) nounwind {
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm0 = mem[0],zero,mem[1],zero
; AVX512-NEXT: vpmovzxdq {{.*#+}} xmm1 = mem[0],zero,mem[1],zero
; AVX512-NEXT: vpcmpeqq %xmm1, %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: # kill: def $xmm0 killed $xmm0 killed $zmm0
; AVX512-NEXT: vzeroupper
; AVX512-NEXT: retq
@@ -414,7 +414,7 @@ define <8 x i32> @cmp_ne_zextload_from_legal_op(ptr %x, ptr %y) {
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
; AVX512-NEXT: vpcmpeqw (%rsi), %xmm0, %xmm0
-; AVX512-NEXT: vpternlogq $15, %zmm0, %zmm0, %zmm0
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = ~zmm0
; AVX512-NEXT: vpmovsxwd %xmm0, %ymm0
; AVX512-NEXT: retq
%loadx = load <8 x i16>, ptr %x
@@ -679,7 +679,7 @@ define <8 x i32> @PR63946(<8 x i32> %a0, <8 x i32> %b0) nounwind {
; AVX512-NEXT: korw %k5, %k0, %k0
; AVX512-NEXT: korw %k6, %k0, %k0
; AVX512-NEXT: korw %k7, %k0, %k1
-; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z}
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1
; AVX512-NEXT: # kill: def $ymm0 killed $ymm0 killed $zmm0
; AVX512-NEXT: retq
entry:
diff --git a/llvm/test/CodeGen/X86/smul_fix.ll b/llvm/test/CodeGen/X86/smul_fix.ll
index ce56283df6010b..49d62b65fa1f6b 100644
--- a/llvm/test/CodeGen/X86/smul_fix.ll
+++ b/llvm/test/CodeGen/X86/smul_fix.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=generic | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mcpu=generic -mattr=cmov | FileCheck %s --check-prefix=X86
declare i4 @llvm.smul.fix.i4 (i4, i4, i32)
declare i32 @llvm.smul.fix.i32 (i32, i32, i32)
@@ -52,11 +52,11 @@ define i64 @func2(i64 %x, i64 %y) {
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %ecx
; X86-NEXT: movl %eax, %esi
@@ -160,38 +160,36 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X86-LABEL: vec:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: imull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shldl $30, %edi, %ecx
; X86-NEXT: shldl $30, %eax, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: imull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shldl $30, %eax, %ebx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: imull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: shldl $30, %eax, %ebp
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: imull {{[0-9]+}}(%esp)
+; X86-NEXT: shldl $30, %ebx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: shldl $30, %eax, %edx
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl %ebp, 8(%ecx)
-; X86-NEXT: movl %ebx, 4(%ecx)
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edx, 12(%ebx)
+; X86-NEXT: movl %edi, 8(%ebx)
+; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: movl %ecx, (%ebx)
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
%tmp = call <4 x i32> @llvm.smul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
ret <4 x i32> %tmp
@@ -287,15 +285,15 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl $4
@@ -319,11 +317,11 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %esi
; X86-NEXT: addl %edx, %ebx
@@ -368,42 +366,42 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebp
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %edx, %ebp
-; X86-NEXT: adcl $0, %edi
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: addl %edx, %ebx
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: imull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
+; X86-NEXT: movl %edx, %ebp
; X86-NEXT: movl %eax, %ecx
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebp, %eax
-; X86-NEXT: adcl %edx, %edi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %ebx
-; X86-NEXT: movl %edi, %ecx
+; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: adcl %edx, %esi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl $0, %ebp
+; X86-NEXT: movl %esi, %ecx
; X86-NEXT: subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ebx, %esi
-; X86-NEXT: sbbl $0, %esi
+; X86-NEXT: movl %ebp, %edi
+; X86-NEXT: sbbl $0, %edi
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %ebx, %esi
-; X86-NEXT: cmovnsl %edi, %ecx
-; X86-NEXT: movl %ecx, %edi
-; X86-NEXT: subl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: cmovnsl %ebp, %edi
+; X86-NEXT: cmovnsl %esi, %ecx
+; X86-NEXT: movl %ecx, %esi
+; X86-NEXT: subl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: sbbl $0, %edx
; X86-NEXT: cmpl $0, {{[0-9]+}}(%esp)
-; X86-NEXT: cmovnsl %esi, %edx
-; X86-NEXT: cmovnsl %ecx, %edi
-; X86-NEXT: shldl $1, %edi, %edx
-; X86-NEXT: shrdl $31, %edi, %eax
+; X86-NEXT: cmovnsl %edi, %edx
+; X86-NEXT: cmovnsl %ecx, %esi
+; X86-NEXT: shldl $1, %esi, %edx
+; X86-NEXT: shrdl $31, %esi, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/statepoint-live-in.ll b/llvm/test/CodeGen/X86/statepoint-live-in.ll
index 787a33aa49b20e..d5e628d1401749 100644
--- a/llvm/test/CodeGen/X86/statepoint-live-in.ll
+++ b/llvm/test/CodeGen/X86/statepoint-live-in.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -O3 -restrict-statepoint-remat < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx10.11.0 -mcpu=x86-64 -verify-machineinstrs -O3 -restrict-statepoint-remat < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.11.0"
declare void @bar() #0
declare void @baz()
diff --git a/llvm/test/CodeGen/X86/statepoint-regs.ll b/llvm/test/CodeGen/X86/statepoint-regs.ll
index 5c26e29dce45ed..36c3a9b61aa684 100644
--- a/llvm/test/CodeGen/X86/statepoint-regs.ll
+++ b/llvm/test/CodeGen/X86/statepoint-regs.ll
@@ -1,7 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx10.11.0 -mcpu=x86-64 -verify-machineinstrs -O3 -use-registers-for-deopt-values -restrict-statepoint-remat=true < %s | FileCheck %s
target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
-target triple = "x86_64-apple-macosx10.11.0"
declare void @bar() #0
declare void @baz()
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index cd643cb8d63751..161c4288570215 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=X64,SSE,SSE2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=X64,AVX,AVX2
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64-v4 | FileCheck %s --check-prefixes=X64,AVX,AVX512
-; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=i686-unknown-unknown -mcpu=generic | FileCheck %s --check-prefix=X86
define i8 @ucmp.8.8(i8 %x, i8 %y) nounwind {
; X64-LABEL: ucmp.8.8:
@@ -310,9 +310,9 @@ define i141 @ucmp_wide_result(i32 %x, i32 %y) nounwind {
; X86-LABEL: ucmp_wide_result:
; X86: # %bb.0:
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: seta %cl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movsbl %cl, %ecx
; X86-NEXT: movl %ecx, (%eax)
@@ -469,27 +469,27 @@ define <4 x i32> @ucmp_normal_vectors(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movsbl %dl, %edx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
-; X86-NEXT: movsbl %bl, %edi
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: seta %bl
-; X86-NEXT: sbbb $0, %bl
+; X86-NEXT: seta %bh
; X86-NEXT: movsbl %bl, %esi
+; X86-NEXT: sbbb $0, %bh
+; X86-NEXT: movsbl %bh, %edi
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
; X86-NEXT: movsbl %cl, %ecx
; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %esi, 8(%eax)
-; X86-NEXT: movl %edi, 4(%eax)
+; X86-NEXT: movl %edi, 8(%eax)
+; X86-NEXT: movl %esi, 4(%eax)
; X86-NEXT: movl %edx, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
@@ -604,33 +604,29 @@ define <4 x i8> @ucmp_narrow_vec_result(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X86-LABEL: ucmp_narrow_vec_result:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebx
-; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %cl
; X86-NEXT: sbbb $0, %cl
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
; X86-NEXT: seta %ch
; X86-NEXT: sbbb $0, %ch
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: seta %bl
-; X86-NEXT: sbbb $0, %bl
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
-; X86-NEXT: movb %dl, 3(%eax)
-; X86-NEXT: movb %bl, 2(%eax)
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: seta %dh
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbb $0, %dh
+; X86-NEXT: movb %dh, 3(%eax)
+; X86-NEXT: movb %dl, 2(%eax)
; X86-NEXT: movb %ch, 1(%eax)
; X86-NEXT: movb %cl, (%eax)
; X86-NEXT: popl %esi
-; X86-NEXT: popl %edi
-; X86-NEXT: popl %ebx
; X86-NEXT: retl $4
%1 = call <4 x i8> @llvm.ucmp(<4 x i32> %x, <4 x i32> %y)
ret <4 x i8> %1
@@ -690,18 +686,18 @@ define <4 x i32> @ucmp_narrow_vec_op(<4 x i8> %x, <4 x i8> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movb {{[0-9]+}}(%esp), %ch
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
; X86-NEXT: seta %dl
; X86-NEXT: sbbb $0, %dl
+; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movsbl %dl, %edx
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %bl
; X86-NEXT: seta %bl
; X86-NEXT: sbbb $0, %bl
-; X86-NEXT: movsbl %bl, %esi
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ch
; X86-NEXT: seta %ch
+; X86-NEXT: movsbl %bl, %esi
; X86-NEXT: sbbb $0, %ch
; X86-NEXT: movsbl %ch, %edi
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %cl
@@ -819,7 +815,7 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; AVX512-NEXT: vpcmpltub %xmm1, %xmm0, %k1
; AVX512-NEXT: vpcmpnleub %xmm1, %xmm0, %k2
; AVX512-NEXT: vpbroadcastd {{.*#+}} zmm0 {%k2} {z} = [1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1]
-; AVX512-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1
+; AVX512-NEXT: vpternlogd {{.*#+}} zmm1 = -1
; AVX512-NEXT: vmovdqa32 %zmm1, %zmm0 {%k1}
; AVX512-NEXT: retq
;
@@ -867,48 +863,48 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %dl
-; X86-NEXT: seta %bl
-; X86-NEXT: sbbb $0, %bl
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, (%esp) # 1-byte Spill
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT: seta %bh
-; X86-NEXT: sbbb $0, %bh
+; X86-NEXT: seta %bl
+; X86-NEXT: sbbb $0, %bl
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT: seta %al
-; X86-NEXT: sbbb $0, %al
-; X86-NEXT: movsbl %al, %eax
-; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: seta %bh
+; X86-NEXT: sbbb $0, %bh
; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
-; X86-NEXT: movsbl %al, %edi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: seta %ah
+; X86-NEXT: movsbl %al, %ecx
+; X86-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: sbbb $0, %ah
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
; X86-NEXT: seta %al
+; X86-NEXT: movsbl %ah, %edi
; X86-NEXT: sbbb $0, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: seta %ah
; X86-NEXT: movsbl %al, %ebp
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT: seta %al
-; X86-NEXT: sbbb $0, %al
-; X86-NEXT: movsbl %al, %esi
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: sbbb $0, %ah
+; X86-NEXT: movb {{[0-9]+}}(%esp), %al
; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
; X86-NEXT: seta %al
+; X86-NEXT: movsbl %ah, %esi
; X86-NEXT: sbbb $0, %al
+; X86-NEXT: movb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: cmpb {{[0-9]+}}(%esp), %ah
+; X86-NEXT: seta %ah
; X86-NEXT: movsbl %al, %edx
-; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: cmpb {{[0-9]+}}(%esp), %al
-; X86-NEXT: seta %al
-; X86-NEXT: sbbb $0, %al
-; X86-NEXT: movsbl %al, %ecx
+; X86-NEXT: sbbb $0, %ah
+; X86-NEXT: movsbl %ah, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl %ecx, 60(%eax)
; X86-NEXT: movl %edx, 56(%eax)
@@ -921,10 +917,10 @@ define <16 x i32> @ucmp_wide_vec_result(<16 x i8> %x, <16 x i8> %y) nounwind {
; X86-NEXT: movsbl %bh, %ecx
; X86-NEXT: movl %ecx, 36(%eax)
; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT: movsbl (%esp), %edx # 1-byte Folded Reload
+; X86-NEXT: movsbl %bl, %edx
; X86-NEXT: movl %edx, 32(%eax)
; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT: movsbl %bl, %edi
+; X86-NEXT: movsbl (%esp), %edi # 1-byte Folded Reload
; X86-NEXT: movl %edi, 28(%eax)
; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
; X86-NEXT: movsbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
@@ -1358,10 +1354,10 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
; X86-NEXT: subl $12, %esp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebp
@@ -1369,47 +1365,47 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %edx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: seta %al
; X86-NEXT: sbbb $0, %al
; X86-NEXT: movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT: cmpl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: cmpl {{[0-9]+}}(%esp), %eax
; X86-NEXT: seta %bh
; X86-NEXT: sbbb $0, %bh
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
diff --git a/llvm/test/CodeGen/X86/umul-with-overflow.ll b/llvm/test/CodeGen/X86/umul-with-overflow.ll
index ccabb360a990c9..e18544448b7475 100644
--- a/llvm/test/CodeGen/X86/umul-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/umul-with-overflow.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=i686 | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=x86-64 | FileCheck %s --check-prefix=X64
declare {i32, i1} @llvm.umul.with.overflow.i32(i32 %a, i32 %b)
@@ -519,63 +519,62 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X64-NEXT: pushq %r13
; X64-NEXT: pushq %r12
; X64-NEXT: pushq %rbx
-; X64-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: movq %r8, %r11
-; X64-NEXT: movq %rcx, %r8
-; X64-NEXT: movq %rdx, %rcx
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: movq %rdx, %r8
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %rbx
; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %r14
; X64-NEXT: movq %rdx, %r14
; X64-NEXT: movq %rax, %r15
; X64-NEXT: addq %rbx, %r15
; X64-NEXT: adcq $0, %r14
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %r10, %rbx
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rbp
-; X64-NEXT: movq %rax, %rbx
-; X64-NEXT: addq %r15, %rbx
+; X64-NEXT: addq %r15, %rax
+; X64-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; X64-NEXT: adcq %r14, %rbp
; X64-NEXT: setb %al
; X64-NEXT: movzbl %al, %r10d
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r12
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r12
; X64-NEXT: movq %rax, %r13
; X64-NEXT: addq %rbp, %r13
; X64-NEXT: adcq %r10, %r12
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %r15
; X64-NEXT: movq %rax, %r14
; X64-NEXT: movq %r11, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: mulq %r10
; X64-NEXT: movq %rdx, %rbp
; X64-NEXT: movq %rax, %r10
; X64-NEXT: addq %r15, %r10
; X64-NEXT: adcq $0, %rbp
-; X64-NEXT: movq %r8, %rax
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rcx, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %r15
; X64-NEXT: addq %r10, %r15
; X64-NEXT: adcq %rbp, %rdx
-; X64-NEXT: imulq %r9, %r11
-; X64-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT: imulq %rbx, %r11
; X64-NEXT: addq %r13, %r14
; X64-NEXT: adcq %r12, %r15
; X64-NEXT: adcq %rdx, %r11
+; X64-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; X64-NEXT: movq %rsi, %rax
-; X64-NEXT: mulq %r9
-; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rax, %r12
-; X64-NEXT: movq %rcx, %rax
-; X64-NEXT: mulq %r9
+; X64-NEXT: movq %rdx, %r10
+; X64-NEXT: movq %r8, %rax
+; X64-NEXT: mulq %rbx
; X64-NEXT: movq %rdx, %r13
; X64-NEXT: movq %rax, %rbp
; X64-NEXT: addq %r10, %rbp
@@ -585,19 +584,19 @@ define i300 @test4(i300 %a, i300 %b) nounwind {
; X64-NEXT: mulq %r10
; X64-NEXT: addq %rbp, %rax
; X64-NEXT: adcq %r13, %rdx
-; X64-NEXT: imulq %r10, %rcx
-; X64-NEXT: addq %rdx, %rcx
+; X64-NEXT: imulq %r10, %r8
+; X64-NEXT: addq %rdx, %r8
; X64-NEXT: addq %r14, %r12
; X64-NEXT: adcq %r15, %rax
-; X64-NEXT: adcq %r11, %rcx
-; X64-NEXT: imulq %r9, %r8
-; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT: adcq %r11, %r8
+; X64-NEXT: imulq %rbx, %rcx
+; X64-NEXT: imulq {{[0-9]+}}(%rsp), %r9
; X64-NEXT: imulq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT: addq %rdx, %rsi
-; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: addq %r9, %rsi
; X64-NEXT: addq %rcx, %rsi
-; X64-NEXT: movq %rbx, 8(%rdi)
+; X64-NEXT: addq %r8, %rsi
+; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT: movq %rcx, 8(%rdi)
; X64-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
; X64-NEXT: movq %rcx, (%rdi)
; X64-NEXT: movq %r12, 16(%rdi)
diff --git a/llvm/test/CodeGen/X86/umul_fix.ll b/llvm/test/CodeGen/X86/umul_fix.ll
index eacc714b49a4d4..c4473e4dc7ffed 100644
--- a/llvm/test/CodeGen/X86/umul_fix.ll
+++ b/llvm/test/CodeGen/X86/umul_fix.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i686 -mattr=cmov | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-linux -mcpu=generic | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686 -mcpu=generic -mattr=cmov | FileCheck %s --check-prefix=X86
declare i4 @llvm.umul.fix.i4 (i4, i4, i32)
declare i32 @llvm.umul.fix.i32 (i32, i32, i32)
@@ -43,26 +43,26 @@ define i64 @func2(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %esi, %eax
+; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %edi
-; X86-NEXT: movl %eax, %ebx
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %eax, %esi
-; X86-NEXT: addl %edx, %ebx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ecx, %eax
; X86-NEXT: mull %ebp
-; X86-NEXT: addl %ebx, %eax
-; X86-NEXT: adcl %edi, %edx
-; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: addl %ecx, %edx
+; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: addl %edx, %edi
+; X86-NEXT: adcl $0, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull %ebp
+; X86-NEXT: addl %edi, %eax
+; X86-NEXT: adcl %esi, %edx
+; X86-NEXT: imull {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: addl %ebx, %edx
; X86-NEXT: shldl $30, %eax, %edx
-; X86-NEXT: shldl $30, %esi, %eax
+; X86-NEXT: shldl $30, %ecx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -116,38 +116,36 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
;
; X86-LABEL: vec:
; X86: # %bb.0:
-; X86-NEXT: pushl %ebp
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT: shldl $30, %edi, %ecx
; X86-NEXT: shldl $30, %eax, %esi
-; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: shldl $30, %eax, %ebx
-; X86-NEXT: movl %ebp, %eax
+; X86-NEXT: movl %edx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebp
-; X86-NEXT: shldl $30, %eax, %ebp
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %eax, %ebx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: shldl $30, %ebx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: shldl $30, %eax, %edx
-; X86-NEXT: movl %edx, 12(%ecx)
-; X86-NEXT: movl %ebp, 8(%ecx)
-; X86-NEXT: movl %ebx, 4(%ecx)
-; X86-NEXT: movl %esi, (%ecx)
-; X86-NEXT: movl %ecx, %eax
+; X86-NEXT: movl %edx, 12(%ebx)
+; X86-NEXT: movl %edi, 8(%ebx)
+; X86-NEXT: movl %esi, 4(%ebx)
+; X86-NEXT: movl %ecx, (%ebx)
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
-; X86-NEXT: popl %ebp
; X86-NEXT: retl $4
%tmp = call <4 x i32> @llvm.umul.fix.v4i32(<4 x i32> %x, <4 x i32> %y, i32 2)
ret <4 x i32> %tmp
@@ -236,15 +234,15 @@ define <4 x i32> @vec2(<4 x i32> %x, <4 x i32> %y) nounwind {
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
; X86-NEXT: movl {{[0-9]+}}(%esp), %edx
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
; X86-NEXT: imull {{[0-9]+}}(%esp), %esi
; X86-NEXT: imull {{[0-9]+}}(%esp), %edx
; X86-NEXT: imull {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl %ecx, 12(%eax)
-; X86-NEXT: movl %edx, 8(%eax)
-; X86-NEXT: movl %esi, 4(%eax)
-; X86-NEXT: movl %edi, (%eax)
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: imull {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, 12(%eax)
+; X86-NEXT: movl %ecx, 8(%eax)
+; X86-NEXT: movl %edx, 4(%eax)
+; X86-NEXT: movl %esi, (%eax)
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: retl $4
@@ -268,11 +266,11 @@ define i64 @func7(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %esi
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %edi
+; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull %ebp
; X86-NEXT: addl %edx, %edi
@@ -306,31 +304,30 @@ define i64 @func8(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebp
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: movl %esi, %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edx, %edi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %edi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %edx, %esi
; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebp
-; X86-NEXT: movl %edx, %esi
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %ebx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl $0, %esi
-; X86-NEXT: shldl $1, %ecx, %esi
+; X86-NEXT: adcl $0, %edi
+; X86-NEXT: shldl $1, %ecx, %edi
; X86-NEXT: shrdl $31, %ecx, %eax
-; X86-NEXT: movl %esi, %edx
+; X86-NEXT: movl %edi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
@@ -354,30 +351,30 @@ define i64 @func9(i64 %x, i64 %y) nounwind {
; X86-NEXT: pushl %ebx
; X86-NEXT: pushl %edi
; X86-NEXT: pushl %esi
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT: movl %edi, %eax
-; X86-NEXT: mull %esi
-; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl %esi, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movl %edi, %eax
+; X86-NEXT: movl %edx, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %ebp, %ebx
-; X86-NEXT: adcl $0, %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %esi
; X86-NEXT: movl %edx, %edi
+; X86-NEXT: addl %ebp, %edi
+; X86-NEXT: adcl $0, %ecx
+; X86-NEXT: movl %ebx, %eax
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %esi
; X86-NEXT: movl %eax, %ebp
-; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT: movl %ebx, %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
-; X86-NEXT: addl %ebx, %eax
+; X86-NEXT: addl %edi, %eax
; X86-NEXT: adcl %edx, %ecx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: addl %ebp, %ecx
-; X86-NEXT: adcl $0, %edi
+; X86-NEXT: adcl $0, %esi
; X86-NEXT: movl %ecx, %eax
-; X86-NEXT: movl %edi, %edx
+; X86-NEXT: movl %esi, %edx
; X86-NEXT: popl %esi
; X86-NEXT: popl %edi
; X86-NEXT: popl %ebx
diff --git a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
index 82603b35ba7128..90b5a44d301ac7 100644
--- a/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
+++ b/llvm/test/CodeGen/X86/umulo-128-legalisation-lowering.ll
@@ -1,6 +1,6 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefixes=X64
-; RUN: llc < %s -mtriple=i686-unknown-linux-gnu | FileCheck %s --check-prefixes=X86
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=generic | FileCheck %s --check-prefixes=X64
+; RUN: llc < %s -mtriple=i686-unknown-linux-gnu -mcpu=generic | FileCheck %s --check-prefixes=X86
define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X64-LABEL: muloti_test:
@@ -44,60 +44,62 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: .cfi_offset %edi, -16
; X86-NEXT: .cfi_offset %ebx, -12
; X86-NEXT: .cfi_offset %ebp, -8
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: mull %ecx
; X86-NEXT: movl %ecx, %ebx
; X86-NEXT: movl %eax, %ecx
+; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT: movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; X86-NEXT: movl %esi, %eax
; X86-NEXT: mull %edi
-; X86-NEXT: leal (%ecx,%eax), %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %edi, %eax
; X86-NEXT: mull %ebx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %esi, %ecx
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: leal (%ecx,%esi), %eax
+; X86-NEXT: addl %eax, %edi
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %eax, %esi
+; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT: mull %ecx
+; X86-NEXT: movl %ecx, %ebp
+; X86-NEXT: movl %eax, %ecx
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %ebx
; X86-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull %ebx
-; X86-NEXT: leal (%esi,%eax), %esi
+; X86-NEXT: movl %eax, %esi
; X86-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl %ebx, %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull %ebp
; X86-NEXT: movl %eax, %ebp
; X86-NEXT: movl %edx, %ebx
-; X86-NEXT: addl %esi, %ebx
+; X86-NEXT: leal (%ecx,%esi), %eax
+; X86-NEXT: addl %eax, %ebx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-NEXT: adcl %ecx, %ebx
+; X86-NEXT: adcl %edi, %ebx
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ecx
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT: mull %edi
-; X86-NEXT: movl %edx, %esi
-; X86-NEXT: movl %eax, %edi
-; X86-NEXT: addl %ecx, %edi
-; X86-NEXT: adcl $0, %esi
+; X86-NEXT: mull {{[0-9]+}}(%esp)
+; X86-NEXT: movl %edx, %edi
+; X86-NEXT: movl %eax, %esi
+; X86-NEXT: addl %ecx, %esi
+; X86-NEXT: adcl $0, %edi
; X86-NEXT: movl {{[0-9]+}}(%esp), %eax
; X86-NEXT: mull {{[0-9]+}}(%esp)
; X86-NEXT: movl %edx, %ecx
-; X86-NEXT: addl %edi, %eax
+; X86-NEXT: addl %esi, %eax
; X86-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NEXT: adcl %esi, %ecx
+; X86-NEXT: adcl %edi, %ecx
; X86-NEXT: setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
; X86-NEXT: movl {{[0-9]+}}(%esp), %esi
; X86-NEXT: movl %esi, %eax
@@ -129,7 +131,6 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
; X86-NEXT: orb %ch, %bl
-; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bl # 1-byte Folded Reload
; X86-NEXT: orl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
; X86-NEXT: setne %bh
; X86-NEXT: orl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
@@ -145,6 +146,7 @@ define { i128, i8 } @muloti_test(i128 %l, i128 %r) unnamed_addr #0 {
; X86-NEXT: orb %bl, %al
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
+; X86-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %al # 1-byte Folded Reload
; X86-NEXT: andb $1, %al
; X86-NEXT: movb %al, 16(%ecx)
; X86-NEXT: movl %ecx, %eax
diff --git a/llvm/test/CodeGen/X86/v8i1-masks.ll b/llvm/test/CodeGen/X86/v8i1-masks.ll
index 67b7eb48e4cb3a..b6538a6397d45a 100644
--- a/llvm/test/CodeGen/X86/v8i1-masks.ll
+++ b/llvm/test/CodeGen/X86/v8i1-masks.ll
@@ -1,10 +1,10 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X86
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx | FileCheck %s --check-prefix=X64
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX2
-; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X86-AVX512
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mattr=+avx512vl | FileCheck %s --check-prefix=X64-AVX512
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=generic -mattr=+avx | FileCheck %s --check-prefix=X86
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic -mattr=+avx | FileCheck %s --check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=generic -mattr=+avx2 | FileCheck %s --check-prefix=X86-AVX2
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic -mattr=+avx2 | FileCheck %s --check-prefix=X64-AVX2
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=generic -mattr=+avx512vl | FileCheck %s --check-prefix=X86-AVX512
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic -mattr=+avx512vl | FileCheck %s --check-prefix=X64-AVX512
define void @and_masks(ptr %a, ptr %b, ptr %c) nounwind uwtable noinline ssp {
; X86-LABEL: and_masks:
@@ -257,8 +257,8 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
; X86-AVX2-LABEL: two_ands:
; X86-AVX2: ## %bb.0: ## %entry
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: retl
@@ -266,8 +266,8 @@ define <8 x i32> @two_ands(<8 x float> %x) local_unnamed_addr #0 {
; X64-AVX2-LABEL: two_ands:
; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
@@ -301,9 +301,9 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X86-NEXT: vandps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -312,9 +312,9 @@ define <8 x i32> @three_ands(<8 x float> %x) {
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -377,11 +377,11 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT: vandps %ymm3, %ymm2, %ymm2
; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-NEXT: vandps %ymm3, %ymm0, %ymm0
; X86-NEXT: vandps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -390,11 +390,11 @@ define <8 x i32> @four_ands(<8 x float> %x) {
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm3, %ymm0, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -465,30 +465,30 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X86-LABEL: five_ands:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
+; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
; X86-NEXT: vandps %ymm3, %ymm2, %ymm2
; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vandps %ymm0, %ymm2, %ymm0
; X86-NEXT: vandps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: five_ands:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
+; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
; X64-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -504,9 +504,9 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X86-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: retl
;
@@ -522,9 +522,9 @@ define <8 x i32> @five_ands(<8 x float> %x) {
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vandps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -585,8 +585,8 @@ define <8 x i32> @two_or(<8 x float> %x) {
; X86-AVX2-LABEL: two_or:
; X86-AVX2: ## %bb.0: ## %entry
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X86-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: retl
@@ -594,8 +594,8 @@ define <8 x i32> @two_or(<8 x float> %x) {
; X64-AVX2-LABEL: two_or:
; X64-AVX2: ## %bb.0: ## %entry
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
+; X64-AVX2-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-AVX2-NEXT: vcmpltps %ymm2, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
@@ -631,9 +631,9 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X86-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X86-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -642,9 +642,9 @@ define <8 x i32> @three_or(<8 x float> %x) {
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
-; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -711,11 +711,11 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X86-NEXT: vorps %ymm3, %ymm2, %ymm2
; X86-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-NEXT: vorps %ymm3, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -724,11 +724,11 @@ define <8 x i32> @four_or(<8 x float> %x) {
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
-; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vorps %ymm3, %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -805,30 +805,30 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X86-LABEL: five_or:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
-; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
+; X86-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
; X86-NEXT: vorps %ymm3, %ymm2, %ymm2
; X86-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vorps %ymm0, %ymm2, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: five_or:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
-; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
-; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
+; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
+; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
+; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; X64-NEXT: vorps %ymm3, %ymm2, %ymm2
; X64-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vorps %ymm0, %ymm2, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -844,9 +844,9 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X86-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: retl
;
@@ -862,9 +862,9 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vorps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
@@ -875,9 +875,9 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X86-AVX512-NEXT: korw %k1, %k0, %k0
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1
+; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
; X86-AVX512-NEXT: korw %k1, %k0, %k0
-; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
-; X86-AVX512-NEXT: korw %k1, %k0, %k0
+; X86-AVX512-NEXT: korw %k2, %k0, %k0
; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
; X86-AVX512-NEXT: korw %k1, %k0, %k1
; X86-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -891,9 +891,9 @@ define <8 x i32> @five_or(<8 x float> %x) {
; X64-AVX512-NEXT: korw %k1, %k0, %k0
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k1
+; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2
; X64-AVX512-NEXT: korw %k1, %k0, %k0
-; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1
-; X64-AVX512-NEXT: korw %k1, %k0, %k0
+; X64-AVX512-NEXT: korw %k2, %k0, %k0
; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1
; X64-AVX512-NEXT: korw %k1, %k0, %k1
; X64-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -999,9 +999,9 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vandps %ymm2, %ymm1, %ymm1
; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-NEXT: vandps %ymm3, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -1012,9 +1012,9 @@ define <8 x i32> @four_or_and(<8 x float> %x) {
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vandps %ymm2, %ymm1, %ymm1
; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm3, %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -1087,30 +1087,30 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X86-LABEL: five_or_and:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X86-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X86-NEXT: vandps %ymm0, %ymm3, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
; X64-LABEL: five_or_and:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vandps %ymm3, %ymm2, %ymm2
-; X64-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vorps %ymm1, %ymm2, %ymm1
+; X64-NEXT: vandps %ymm0, %ymm3, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -1123,9 +1123,9 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X86-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X86-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm2
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
@@ -1141,9 +1141,9 @@ define <8 x i32> @five_or_and(<8 x float> %x) {
; X64-AVX2-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vandps %ymm3, %ymm2, %ymm2
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
; X64-AVX2-NEXT: vorps %ymm1, %ymm2, %ymm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1,1.00000001E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm2
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vandps %ymm0, %ymm2, %ymm0
@@ -1199,9 +1199,9 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
; X86-NEXT: vxorps %ymm2, %ymm1, %ymm1
; X86-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm0
-; X86-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X86-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm3
+; X86-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-NEXT: vandps %ymm3, %ymm0, %ymm0
; X86-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-NEXT: retl
;
@@ -1212,9 +1212,9 @@ define <8 x i32> @four_or_and_xor(<8 x float> %x) {
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
; X64-NEXT: vxorps %ymm2, %ymm1, %ymm1
; X64-NEXT: vxorps %xmm2, %xmm2, %xmm2
-; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm2
-; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm0
-; X64-NEXT: vandps %ymm0, %ymm2, %ymm0
+; X64-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3
+; X64-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-NEXT: vandps %ymm3, %ymm0, %ymm0
; X64-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-NEXT: retq
;
@@ -1289,8 +1289,8 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X86-LABEL: five_or_and_xor:
; X86: ## %bb.0: ## %entry
; X86-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}, %ymm0, %ymm2
+; X86-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X86-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X86-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-NEXT: vxorps %ymm3, %ymm2, %ymm2
@@ -1304,8 +1304,8 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X64-LABEL: five_or_and_xor:
; X64: ## %bb.0: ## %entry
; X64-NEXT: vbroadcastss {{.*#+}} ymm1 = [-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1,-5.0E-1]
-; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2
+; X64-NEXT: vcmpleps %ymm0, %ymm1, %ymm1
; X64-NEXT: vxorps %xmm3, %xmm3, %xmm3
; X64-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-NEXT: vxorps %ymm3, %ymm2, %ymm2
@@ -1358,9 +1358,9 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X86-AVX512-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k1
; X86-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X86-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k2
+; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k3
; X86-AVX512-NEXT: kxorw %k2, %k1, %k1
-; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2
-; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 {%k2}
+; X86-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}{1to8}, %ymm0, %k2 {%k3}
; X86-AVX512-NEXT: kxorw %k2, %k1, %k1
; X86-AVX512-NEXT: korw %k0, %k1, %k1
; X86-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -1373,9 +1373,9 @@ define <8 x i32> @five_or_and_xor(<8 x float> %x) {
; X64-AVX512-NEXT: vcmpltps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k1
; X64-AVX512-NEXT: vxorps %xmm1, %xmm1, %xmm1
; X64-AVX512-NEXT: vcmpneqps %ymm1, %ymm0, %k2
+; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k3
; X64-AVX512-NEXT: kxorw %k2, %k1, %k1
-; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2
-; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 {%k2}
+; X64-AVX512-NEXT: vcmpneqps {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %k2 {%k3}
; X64-AVX512-NEXT: kxorw %k2, %k1, %k1
; X64-AVX512-NEXT: korw %k0, %k1, %k1
; X64-AVX512-NEXT: vpcmpeqd %ymm0, %ymm0, %ymm0
@@ -1444,9 +1444,9 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X86-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
+; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X86-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X86-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
-; X86-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X86-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X86-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X86-AVX2-NEXT: retl
;
@@ -1465,9 +1465,9 @@ define <8 x i32> @six_or_and_xor(<8 x float> %x) {
; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1,2.00000003E-1]
; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm3
; X64-AVX2-NEXT: vxorps %ymm1, %ymm3, %ymm1
+; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm3 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
; X64-AVX2-NEXT: vxorps %ymm2, %ymm1, %ymm1
-; X64-AVX2-NEXT: vbroadcastss {{.*#+}} ymm2 = [4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1,4.00000006E-1]
-; X64-AVX2-NEXT: vcmpneqps %ymm2, %ymm0, %ymm0
+; X64-AVX2-NEXT: vcmpneqps %ymm3, %ymm0, %ymm0
; X64-AVX2-NEXT: vorps %ymm0, %ymm1, %ymm0
; X64-AVX2-NEXT: retq
;
diff --git a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
index 2936b55ef6ed4d..bd27ebb46afdcb 100644
--- a/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
+++ b/llvm/test/CodeGen/X86/vector-interleaved-store-i8-stride-5.ll
@@ -1,17 +1,17 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx | FileCheck %s --check-prefixes=AVX
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
-; RUN: llc < %s -mtriple=x86_64-- -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX2
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx2,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX2-FCP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl | FileCheck %s --check-prefixes=AVX512
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512-FCP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512dq | FileCheck %s --check-prefixes=AVX512DQ
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512dq,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-FCP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512bw | FileCheck %s --check-prefixes=AVX512BW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512BW-FCP
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512dq,+avx512bw | FileCheck %s --check-prefixes=AVX512DQ-BW
+; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64 -mattr=+avx512vl,+avx512dq,+avx512bw,+fast-variable-crosslane-shuffle,+fast-variable-perlane-shuffle | FileCheck %s --check-prefixes=AVX512DQ-BW-FCP
; These patterns are produced by LoopVectorizer for interleaved stores.
@@ -45,8 +45,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX-LABEL: store_i8_stride5_vf2:
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
-; AVX-NEXT: vmovdqa (%rdx), %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX-NEXT: vmovdqa (%rdx), %xmm1
; AVX-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -58,8 +58,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-LABEL: store_i8_stride5_vf2:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -71,8 +71,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-LABEL: store_i8_stride5_vf2:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -84,8 +84,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FCP-LABEL: store_i8_stride5_vf2:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -97,8 +97,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-LABEL: store_i8_stride5_vf2:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -110,8 +110,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-LABEL: store_i8_stride5_vf2:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -123,8 +123,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-LABEL: store_i8_stride5_vf2:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -136,8 +136,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf2:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -149,8 +149,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-LABEL: store_i8_stride5_vf2:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -162,8 +162,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-LABEL: store_i8_stride5_vf2:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -175,8 +175,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-LABEL: store_i8_stride5_vf2:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -188,8 +188,8 @@ define void @store_i8_stride5_vf2(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf2:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpcklwd {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3]
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1],xmm0[2],mem[2],xmm0[3],mem[3],xmm0[4],mem[4],xmm0[5],mem[5],xmm0[6],mem[6],xmm0[7],mem[7]
@@ -217,9 +217,9 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; SSE: # %bb.0:
; SSE-NEXT: movdqa (%rdi), %xmm1
; SSE-NEXT: movdqa (%rdx), %xmm2
-; SSE-NEXT: movdqa (%r8), %xmm0
; SSE-NEXT: punpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; SSE-NEXT: punpckldq {{.*#+}} xmm2 = xmm2[0],mem[0],xmm2[1],mem[1]
+; SSE-NEXT: movdqa (%r8), %xmm0
; SSE-NEXT: pxor %xmm3, %xmm3
; SSE-NEXT: punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm3[0],xmm1[1],xmm3[1],xmm1[2],xmm3[2],xmm1[3],xmm3[3],xmm1[4],xmm3[4],xmm1[5],xmm3[5],xmm1[6],xmm3[6],xmm1[7],xmm3[7]
; SSE-NEXT: pshufd {{.*#+}} xmm4 = xmm1[3,1,2,1]
@@ -264,9 +264,9 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX: # %bb.0:
; AVX-NEXT: vmovdqa (%rdi), %xmm0
; AVX-NEXT: vmovdqa (%rdx), %xmm1
-; AVX-NEXT: vmovdqa (%r8), %xmm2
; AVX-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
; AVX-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
+; AVX-NEXT: vmovdqa (%r8), %xmm2
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,4,8,12],zero,xmm0[1,5,9,13],zero,xmm0[2,6,10,14],zero,xmm0[3]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero
@@ -280,8 +280,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-LABEL: store_i8_stride5_vf4:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -298,8 +298,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-LABEL: store_i8_stride5_vf4:
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -316,8 +316,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FCP-LABEL: store_i8_stride5_vf4:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX2-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -334,8 +334,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-LABEL: store_i8_stride5_vf4:
; AVX512: # %bb.0:
; AVX512-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -352,8 +352,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-LABEL: store_i8_stride5_vf4:
; AVX512-FCP: # %bb.0:
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -370,8 +370,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-LABEL: store_i8_stride5_vf4:
; AVX512DQ: # %bb.0:
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -388,8 +388,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf4:
; AVX512DQ-FCP: # %bb.0:
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -406,8 +406,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-LABEL: store_i8_stride5_vf4:
; AVX512BW: # %bb.0:
; AVX512BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -424,8 +424,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-LABEL: store_i8_stride5_vf4:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -442,8 +442,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-LABEL: store_i8_stride5_vf4:
; AVX512DQ-BW: # %bb.0:
; AVX512DQ-BW-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -460,8 +460,8 @@ define void @store_i8_stride5_vf4(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf4:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm0 = xmm0[0],mem[0],xmm0[1],mem[1]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %xmm1
; AVX512DQ-BW-FCP-NEXT: vpunpckldq {{.*#+}} xmm1 = xmm1[0],mem[0],xmm1[1],mem[1]
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, (%r8), %ymm0, %ymm0
@@ -592,8 +592,8 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX-NEXT: vpor %xmm4, %xmm5, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[0,1,2,3],zero,xmm4[5,6,7,8],zero,xmm4[10,11,12,13],zero,xmm4[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,zero,xmm2[0],zero,zero,zero,zero,xmm2[1],zero,zero,zero,zero,xmm2[2],zero
-; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[3,11,u],zero,zero,xmm1[4,12,u],zero,zero,xmm1[5,13,u],zero,zero
+; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[11],zero,zero,xmm0[u,4,12],zero,zero,xmm0[u,5,13],zero,zero,xmm0[u,6,14]
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,1,2],zero,xmm0[4,5,6,7],zero,xmm0[9,10,11,12],zero,xmm0[14,15]
@@ -614,12 +614,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-NEXT: vmovq %rax, %xmm3
-; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-NEXT: vmovq %rax, %xmm4
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX2-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; AVX2-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
@@ -646,12 +646,12 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-FP-NEXT: vmovq %rax, %xmm3
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vmovq %rax, %xmm4
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX2-FP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX2-FP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; AVX2-FP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
@@ -678,13 +678,13 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX2-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX2-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX2-FCP-NEXT: vmovq %rax, %xmm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX2-FCP-NEXT: vmovq %rax, %xmm4
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX2-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX2-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1]
+; AVX2-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
@@ -702,7 +702,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512-LABEL: store_i8_stride5_vf8:
; AVX512: # %bb.0:
-; AVX512-NEXT: movq (%r8), %rax
; AVX512-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -710,12 +709,13 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512-NEXT: vmovq %rax, %xmm3
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512-NEXT: movq (%r8), %rax
; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
-; AVX512-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX512-NEXT: vmovq %rax, %xmm4
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
@@ -733,7 +733,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512-FCP-LABEL: store_i8_stride5_vf8:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: movq (%r8), %rax
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -741,13 +740,14 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512-FCP-NEXT: vmovq %rax, %xmm3
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512-FCP-NEXT: movq (%r8), %rax
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX512-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512-FCP-NEXT: vmovq %rax, %xmm4
+; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
@@ -764,7 +764,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512DQ-LABEL: store_i8_stride5_vf8:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: movq (%r8), %rax
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -772,12 +771,13 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512DQ-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512DQ-NEXT: movq (%r8), %rax
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
-; AVX512DQ-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX512DQ-NEXT: vmovq %rax, %xmm4
+; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
@@ -795,7 +795,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf8:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -803,13 +802,14 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512DQ-FCP-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,ymm2[u,1,9],zero,zero,ymm2[u,2,10],zero,zero,ymm2[u,3],zero,ymm2[19,27,u],zero,zero,ymm2[20,28,u],zero,zero,ymm2[21,29,u],zero,zero
+; AVX512DQ-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8,u],zero,zero,ymm2[1,9,u],zero,zero,ymm2[2,10,u],zero,ymm2[27],zero,zero,ymm2[u,20,28],zero,zero,ymm2[u,21,29],zero,zero,ymm2[u,22,30]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX512DQ-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-FCP-NEXT: vmovq %rax, %xmm4
+; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm3 = ymm3 ^ (mem & (ymm3 ^ ymm2))
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
@@ -826,7 +826,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512BW-LABEL: store_i8_stride5_vf8:
; AVX512BW: # %bb.0:
-; AVX512BW-NEXT: movq (%r8), %rax
; AVX512BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -834,17 +833,18 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512BW-NEXT: vmovq %rax, %xmm3
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512BW-NEXT: movq (%r8), %rax
; AVX512BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX512BW-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX512BW-NEXT: vmovq %rax, %xmm4
+; AVX512BW-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX512BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX512BW-NEXT: movl $554189328, %ecx # imm = 0x21084210
; AVX512BW-NEXT: kmovd %ecx, %k1
-; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
+; AVX512BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
; AVX512BW-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512BW-NEXT: shrq $48, %rax
@@ -860,7 +860,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512BW-FCP-LABEL: store_i8_stride5_vf8:
; AVX512BW-FCP: # %bb.0:
-; AVX512BW-FCP-NEXT: movq (%r8), %rax
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -868,17 +867,18 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512BW-FCP-NEXT: vmovq %rax, %xmm3
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512BW-FCP-NEXT: movq (%r8), %rax
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX512BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX512BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512BW-FCP-NEXT: vmovq %rax, %xmm4
+; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1]
+; AVX512BW-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3
; AVX512BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210
; AVX512BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
+; AVX512BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
; AVX512BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512BW-FCP-NEXT: shrq $48, %rax
@@ -894,7 +894,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512DQ-BW-LABEL: store_i8_stride5_vf8:
; AVX512DQ-BW: # %bb.0:
-; AVX512DQ-BW-NEXT: movq (%r8), %rax
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -902,17 +901,18 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512DQ-BW-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512DQ-BW-NEXT: movq (%r8), %rax
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX512DQ-BW-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm3[0,0,1,1]
+; AVX512DQ-BW-NEXT: vmovq %rax, %xmm4
+; AVX512DQ-BW-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512DQ-BW-NEXT: vpshufd {{.*#+}} xmm3 = xmm4[0,0,1,1]
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,0,1]
; AVX512DQ-BW-NEXT: movl $554189328, %ecx # imm = 0x21084210
; AVX512DQ-BW-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-BW-NEXT: shrq $48, %rax
@@ -928,7 +928,6 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
;
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf8:
; AVX512DQ-BW-FCP: # %bb.0:
-; AVX512DQ-BW-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm0 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm1 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
@@ -936,17 +935,18 @@ define void @store_i8_stride5_vf8(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vecp
; AVX512DQ-BW-FCP-NEXT: vmovq {{.*#+}} xmm2 = mem[0],zero
; AVX512DQ-BW-FCP-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
; AVX512DQ-BW-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vmovq %rax, %xmm3
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,zero,ymm2[3],zero,ymm2[19,27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero
+; AVX512DQ-BW-FCP-NEXT: movq (%r8), %rax
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,0,1]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,ymm2[0,8],zero,zero,zero,ymm2[1,9],zero,zero,zero,ymm2[2,10],zero,zero,ymm2[27],zero,zero,zero,ymm2[20,28],zero,zero,zero,ymm2[21,29],zero,zero,zero,ymm2[22,30]
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm4, %ymm2, %ymm2
-; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX512DQ-BW-FCP-NEXT: vpermd %ymm3, %ymm4, %ymm3
+; AVX512DQ-BW-FCP-NEXT: vmovq %rax, %xmm4
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm2, %ymm2
+; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [0,0,0,0,0,0,1,1]
+; AVX512DQ-BW-FCP-NEXT: vpermd %ymm4, %ymm3, %ymm3
; AVX512DQ-BW-FCP-NEXT: movl $554189328, %ecx # imm = 0x21084210
; AVX512DQ-BW-FCP-NEXT: kmovd %ecx, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,zero,xmm0[u,7,15],zero,zero,xmm0[u,u,u,u,u,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %ymm3, %ymm2 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[6,14,u],zero,zero,xmm1[7,15,u,u,u,u,u,u,u,u,u]
; AVX512DQ-BW-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX512DQ-BW-FCP-NEXT: shrq $48, %rax
@@ -1136,97 +1136,97 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX-LABEL: store_i8_stride5_vf16:
; AVX: # %bb.0:
-; AVX-NEXT: vmovdqa (%rdi), %xmm2
-; AVX-NEXT: vmovdqa (%rsi), %xmm3
+; AVX-NEXT: vmovdqa (%rdi), %xmm4
+; AVX-NEXT: vmovdqa (%rsi), %xmm5
; AVX-NEXT: vmovdqa (%rdx), %xmm1
-; AVX-NEXT: vmovdqa (%rcx), %xmm4
+; AVX-NEXT: vmovdqa (%rcx), %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm2[6,u,u,u],zero,xmm2[7,u,u,u],zero,xmm2[8,u,u,u],zero
; AVX-NEXT: vmovdqa (%r8), %xmm0
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm4[6,u,u,u],zero,xmm4[7,u,u,u],zero,xmm4[8,u,u,u],zero
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm1[6],zero,xmm1[u,u,u,7],zero,xmm1[u,u,u,8],zero,xmm1[u,u,u,9]
-; AVX-NEXT: vpor %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm3[u,u,u],zero,xmm3[7,u,u,u],zero,xmm3[8,u,u,u],zero,xmm3[9,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm2[u,u,u,7],zero,xmm2[u,u,u,8],zero,xmm2[u,u,u,9],zero,xmm2[u]
+; AVX-NEXT: vpor %xmm3, %xmm6, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[u,u,u],zero,xmm5[7,u,u,u],zero,xmm5[8,u,u,u],zero,xmm5[9,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = xmm4[u,u,u,7],zero,xmm4[u,u,u,8],zero,xmm4[u,u,u,9],zero,xmm4[u]
; AVX-NEXT: vpor %xmm6, %xmm7, %xmm6
; AVX-NEXT: vmovdqa {{.*#+}} xmm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255]
-; AVX-NEXT: vpblendvb %xmm7, %xmm5, %xmm6, %xmm5
-; AVX-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1],zero,xmm5[3,4,5,6],zero,xmm5[8,9,10,11],zero,xmm5[13,14,15]
+; AVX-NEXT: vpblendvb %xmm7, %xmm3, %xmm6, %xmm3
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1],zero,xmm3[3,4,5,6],zero,xmm3[8,9,10,11],zero,xmm3[13,14,15]
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,xmm0[6],zero,zero,zero,zero,xmm0[7],zero,zero,zero,zero,xmm0[8],zero,zero,zero
-; AVX-NEXT: vpor %xmm6, %xmm5, %xmm5
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm4[8],xmm1[9],xmm4[9],xmm1[10],xmm4[10],xmm1[11],xmm4[11],xmm1[12],xmm4[12],xmm1[13],xmm4[13],xmm1[14],xmm4[14],xmm1[15],xmm4[15]
+; AVX-NEXT: vpor %xmm6, %xmm3, %xmm3
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm6 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm6 = zero,zero,zero,xmm6[10,11],zero,zero,zero,xmm6[12,13],zero,zero,zero,xmm6[14,15],zero
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm7 = xmm4[8],xmm5[8],xmm4[9],xmm5[9],xmm4[10],xmm5[10],xmm4[11],xmm5[11],xmm4[12],xmm5[12],xmm4[13],xmm5[13],xmm4[14],xmm5[14],xmm4[15],xmm5[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[10,11],zero,zero,zero,xmm7[12,13],zero,zero,zero,xmm7[14,15],zero,zero,zero
; AVX-NEXT: vpor %xmm6, %xmm8, %xmm6
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm4[0],xmm1[1],xmm4[1],xmm1[2],xmm4[2],xmm1[3],xmm4[3],xmm1[4],xmm4[4],xmm1[5],xmm4[5],xmm1[6],xmm4[6],xmm1[7],xmm4[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm1[0],xmm2[0],xmm1[1],xmm2[1],xmm1[2],xmm2[2],xmm1[3],xmm2[3],xmm1[4],xmm2[4],xmm1[5],xmm2[5],xmm1[6],xmm2[6],xmm1[7],xmm2[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm9 = zero,zero,xmm8[0,1],zero,zero,zero,xmm8[2,3],zero,zero,zero,xmm8[4,5],zero,zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm2[0],xmm3[0],xmm2[1],xmm3[1],xmm2[2],xmm3[2],xmm2[3],xmm3[3],xmm2[4],xmm3[4],xmm2[5],xmm3[5],xmm2[6],xmm3[6],xmm2[7],xmm3[7]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm4[0],xmm5[0],xmm4[1],xmm5[1],xmm4[2],xmm5[2],xmm4[3],xmm5[3],xmm4[4],xmm5[4],xmm4[5],xmm5[5],xmm4[6],xmm5[6],xmm4[7],xmm5[7]
; AVX-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[0,1],zero,zero,zero,xmm10[2,3],zero,zero,zero,xmm10[4,5],zero,zero,zero,xmm10[6]
+; AVX-NEXT: vpshufb {{.*#+}} xmm11 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
; AVX-NEXT: vpor %xmm9, %xmm10, %xmm9
-; AVX-NEXT: vpshufb {{.*#+}} xmm10 = zero,zero,zero,zero,xmm0[0],zero,zero,zero,zero,xmm0[1],zero,zero,zero,zero,xmm0[2],zero
-; AVX-NEXT: vpor %xmm10, %xmm9, %xmm9
+; AVX-NEXT: vpor %xmm11, %xmm9, %xmm9
; AVX-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6,7],zero,zero,zero,xmm8[8,9],zero,zero,zero,xmm8[10,11],zero,zero,zero
-; AVX-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; AVX-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[6],zero,zero,zero,xmm2[9,8],zero,zero,zero,xmm2[11,10],zero,zero,zero,xmm2[13,12]
-; AVX-NEXT: vpor %xmm2, %xmm8, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
-; AVX-NEXT: vpor %xmm3, %xmm2, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
-; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm4[8],xmm1[8],xmm4[9],xmm1[9],xmm4[10],xmm1[10],xmm4[11],xmm1[11],xmm4[12],xmm1[12],xmm4[13],xmm1[13],xmm4[14],xmm1[14],xmm4[15],xmm1[15]
+; AVX-NEXT: vpunpcklbw {{.*#+}} xmm4 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm4[6],zero,zero,zero,xmm4[9,8],zero,zero,zero,xmm4[11,10],zero,zero,zero,xmm4[13,12]
+; AVX-NEXT: vpor %xmm4, %xmm8, %xmm4
+; AVX-NEXT: vpshufb {{.*#+}} xmm5 = zero,zero,zero,xmm0[3],zero,zero,zero,zero,xmm0[4],zero,zero,zero,zero,xmm0[5],zero,zero
+; AVX-NEXT: vpshufb {{.*#+}} xmm7 = zero,zero,xmm7[4,5],zero,zero,zero,xmm7[6,7],zero,zero,zero,xmm7[8,9],zero,zero
+; AVX-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm2[8],xmm1[8],xmm2[9],xmm1[9],xmm2[10],xmm1[10],xmm2[11],xmm1[11],xmm2[12],xmm1[12],xmm2[13],xmm1[13],xmm2[14],xmm1[14],xmm2[15],xmm1[15]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[2],zero,zero,zero,xmm1[5,4],zero,zero,zero,xmm1[7,6],zero,zero,zero,xmm1[9,8]
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
+; AVX-NEXT: vpor %xmm7, %xmm1, %xmm1
+; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm0[9],zero,zero,zero,zero,xmm0[10],zero,zero,zero,zero,xmm0[11],zero,zero,zero,zero
+; AVX-NEXT: vpor %xmm2, %xmm1, %xmm1
; AVX-NEXT: vmovdqa %xmm1, 48(%r9)
-; AVX-NEXT: vmovdqa %xmm2, 16(%r9)
-; AVX-NEXT: vmovdqa %xmm9, (%r9)
+; AVX-NEXT: vmovdqa %xmm4, 16(%r9)
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
+; AVX-NEXT: vmovdqa %xmm9, (%r9)
; AVX-NEXT: vpor %xmm0, %xmm6, %xmm0
; AVX-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX-NEXT: vmovdqa %xmm5, 32(%r9)
+; AVX-NEXT: vmovdqa %xmm3, 32(%r9)
; AVX-NEXT: retq
;
; AVX2-LABEL: store_i8_stride5_vf16:
; AVX2: # %bb.0:
; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-NEXT: vmovdqa (%rdx), %xmm3
-; AVX2-NEXT: vmovdqa (%rcx), %xmm4
-; AVX2-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
-; AVX2-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
+; AVX2-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-NEXT: vmovdqa (%rcx), %xmm3
+; AVX2-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9,25],zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28]
+; AVX2-NEXT: vmovdqa (%r8), %xmm6
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[6],zero,zero,zero,zero,ymm7[7],zero,zero,zero,zero,ymm7[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[26],zero,zero,zero,zero,ymm7[27],zero,zero,zero,zero,ymm7[28],zero
+; AVX2-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX2-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
-; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-NEXT: vpor %ymm7, %ymm9, %ymm7
; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX2-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
-; AVX2-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX2-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
+; AVX2-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm8[0,2,2,0]
+; AVX2-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[3,19],zero,zero,zero,ymm7[28,20],zero,zero,zero,ymm7[29,21],zero,zero,zero,ymm7[30,22]
+; AVX2-NEXT: vpor %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1]
+; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX2-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
+; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
+; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX2-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX2-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX2-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX2-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[12],zero,zero,zero,zero,xmm6[13],zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,xmm6[15]
; AVX2-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX2-NEXT: vmovdqa %ymm5, (%r9)
-; AVX2-NEXT: vmovdqa %ymm7, 32(%r9)
+; AVX2-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
@@ -1234,86 +1234,86 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP: # %bb.0:
; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm4
-; AVX2-FP-NEXT: vmovdqa (%r8), %xmm2
-; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-FP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm6[6],zero,zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9,25],zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
-; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9],zero,zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28],zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm3
+; AVX2-FP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[6],zero,zero,zero,zero,ymm4[7],zero,zero,zero,zero,ymm4[8],zero,zero,zero,zero,ymm4[9,25],zero,zero,zero,zero,ymm4[26],zero,zero,zero,zero,ymm4[27],zero,zero,zero,zero,ymm4[28]
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm6
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm4[2,3,0,1]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,ymm7[6],zero,zero,zero,zero,ymm7[7],zero,zero,zero,zero,ymm7[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm7[26],zero,zero,zero,zero,ymm7[27],zero,zero,zero,zero,ymm7[28],zero
+; AVX2-FP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm8
+; AVX2-FP-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm8[2,3,0,1]
; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,ymm9[9],zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-FP-NEXT: vpor %ymm7, %ymm9, %ymm7
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm8 = xmm2[1,1,2,2]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,1,1,1]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[1,1,2,2]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,2,0,2]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,zero,ymm5[1,9],zero,zero,zero,ymm5[2,10],zero,zero,zero,ymm5[3,19],zero,zero,zero,ymm5[28,20],zero,zero,zero,ymm5[29,21],zero,zero,zero,ymm5[30,22]
-; AVX2-FP-NEXT: vpor %ymm6, %ymm5, %ymm5
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm6 = xmm2[0,0,1,1]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,0,1]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm5, %ymm7, %ymm5
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,0,2]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = zero,zero,ymm4[0,8],zero,zero,zero,ymm4[1,9],zero,zero,zero,ymm4[2,10],zero,zero,zero,ymm4[19,27],zero,zero,zero,ymm4[20,28],zero,zero,zero,ymm4[21,29],zero,zero,zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm8[0,2,2,0]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[3,19],zero,zero,zero,ymm7[28,20],zero,zero,zero,ymm7[29,21],zero,zero,zero,ymm7[30,22]
+; AVX2-FP-NEXT: vpor %ymm4, %ymm7, %ymm4
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm6[0,0,1,1]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,0,1]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
+; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm7, %ymm4
+; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX2-FP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-FP-NEXT: vpor %xmm3, %xmm0, %xmm0
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm2[12],zero,zero,zero,zero,xmm2[13],zero,zero,zero,zero,xmm2[14],zero,zero,zero,zero,xmm2[15]
+; AVX2-FP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm1 = xmm6[12],zero,zero,zero,zero,xmm6[13],zero,zero,zero,zero,xmm6[14],zero,zero,zero,zero,xmm6[15]
; AVX2-FP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm7, 32(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm5, 32(%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride5_vf16:
; AVX2-FCP: # %bb.0:
; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm1
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm2
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm3
-; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm4
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm5
-; AVX2-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm6
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8],zero,zero,zero,ymm7[1,9],zero,zero,zero,ymm7[2,10],zero,zero,zero,ymm7[19,27],zero,zero,zero,ymm7[20,28],zero,zero,zero,ymm7[21,29],zero,zero,zero
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm5[0,2,2,0]
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm2
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm3
+; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm4
+; AVX2-FCP-NEXT: vmovdqa (%r8), %xmm1
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm5
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,2,0,2]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8],zero,zero,zero,ymm6[1,9],zero,zero,zero,ymm6[2,10],zero,zero,zero,ymm6[19,27],zero,zero,zero,ymm6[20,28],zero,zero,zero,ymm6[21,29],zero,zero,zero
+; AVX2-FCP-NEXT: vinserti128 $1, %xmm2, %ymm0, %ymm7
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm7[0,2,2,0]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm8[0,8],zero,zero,zero,ymm8[1,9],zero,zero,zero,ymm8[2,10],zero,zero,zero,ymm8[3,19],zero,zero,zero,ymm8[28,20],zero,zero,zero,ymm8[29,21],zero,zero,zero,ymm8[30,22]
-; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [0,0,0,0,0,0,1,1]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,13],zero,zero,zero,ymm5[18,22],zero,zero,zero,ymm5[19,23],zero,zero,zero,ymm5[24,28],zero,zero
+; AVX2-FCP-NEXT: vpermd %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm7[3,7],zero,zero,zero,ymm7[8,12],zero,zero,zero,ymm7[9,13],zero,zero,zero,ymm7[18,22],zero,zero,zero,ymm7[19,23],zero,zero,zero,ymm7[24,28],zero,zero
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX2-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6],zero,zero,zero,ymm6[3,7],zero,zero,zero,ymm6[8,12],zero,zero,zero,ymm6[9,17],zero,zero,zero,ymm6[22,18],zero,zero,zero,ymm6[23,19],zero,zero,zero,ymm6[24,28]
-; AVX2-FCP-NEXT: vpor %ymm5, %ymm6, %ymm5
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2]
-; AVX2-FCP-NEXT: vpermd %ymm4, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6],zero,zero,zero,ymm5[3,7],zero,zero,zero,ymm5[8,12],zero,zero,zero,ymm5[9,17],zero,zero,zero,ymm5[22,18],zero,zero,zero,ymm5[23,19],zero,zero,zero,ymm5[24,28]
+; AVX2-FCP-NEXT: vpor %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm6, %ymm5
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
-; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm5, %ymm7, %ymm5
+; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,xmm3[10,11],zero,zero,zero,xmm3[12,13],zero,zero,zero,xmm3[14,15],zero
+; AVX2-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm2[8],xmm0[9],xmm2[9],xmm0[10],xmm2[10],xmm0[11],xmm2[11],xmm0[12],xmm2[12],xmm0[13],xmm2[13],xmm0[14],xmm2[14],xmm0[15],xmm2[15]
; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
-; AVX2-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
+; AVX2-FCP-NEXT: vpor %xmm3, %xmm0, %xmm0
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[12],zero,zero,zero,zero,xmm1[13],zero,zero,zero,zero,xmm1[14],zero,zero,zero,zero,xmm1[15]
; AVX2-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX2-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm7, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
@@ -1360,40 +1360,40 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-FCP-LABEL: store_i8_stride5_vf16:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm4
-; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22]
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX512-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28]
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6
-; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
-; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u]
-; AVX512-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
-; AVX512-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,2,0,2]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u],zero,zero,ymm6[1,9,u],zero,zero,ymm6[2,10,u],zero,zero,ymm6[19,27,u],zero,zero,ymm6[20,28,u],zero,zero,ymm6[21,29,u],zero,zero
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,2,6,3,7]
+; AVX512-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm7
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[u,3,7],zero,zero,ymm7[u,8,12],zero,zero,ymm7[u,9,13],zero,zero,ymm7[u,18,22],zero,zero,ymm7[u,19,23],zero,zero,ymm7[u,24,28],zero,zero
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,0]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7]
+; AVX512-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
+; AVX512-FCP-NEXT: vmovdqa (%r8), %xmm7
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vporq %zmm6, %zmm4, %zmm4
+; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm5
+; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm5
+; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4))
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u]
+; AVX512-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u]
+; AVX512-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15]
+; AVX512-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
; AVX512-FCP-NEXT: vzeroupper
; AVX512-FCP-NEXT: retq
;
@@ -1440,40 +1440,40 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf16:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm1
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm3
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm4
-; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm0
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm5
-; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm4, %ymm3, %ymm6
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm6[0,2,0,2]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[0,8,u],zero,zero,ymm7[1,9,u],zero,zero,ymm7[2,10,u],zero,zero,ymm7[19,27,u],zero,zero,ymm7[20,28,u],zero,zero,ymm7[21,29,u],zero,zero
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,2,6,3,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm8, %ymm8
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,ymm8[u,3,7],zero,zero,ymm8[u,8,12],zero,zero,ymm8[u,9,13],zero,zero,ymm8[u,18,22],zero,zero,ymm8[u,19,23],zero,zero,ymm8[u,24,28],zero,zero
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,2,2,0]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[0,8],zero,zero,ymm5[u,1,9],zero,zero,ymm5[u,2,10],zero,zero,ymm5[u,3,19],zero,zero,ymm5[u,28,20],zero,zero,ymm5[u,29,21],zero,zero,ymm5[u,30,22]
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [1,5,2,6,6,2,3,7]
-; AVX512DQ-FCP-NEXT: vpermd %ymm6, %ymm8, %ymm6
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = ymm6[2,6,u],zero,zero,ymm6[3,7,u],zero,zero,ymm6[8,12,u],zero,zero,ymm6[9,17,u],zero,zero,ymm6[22,18,u],zero,zero,ymm6[23,19,u],zero,zero,ymm6[24,28]
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm6, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vporq %zmm7, %zmm5, %zmm5
-; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm0, %zmm0, %zmm6
-; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
-; AVX512DQ-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
-; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm3 = xmm3[8],xmm4[8],xmm3[9],xmm4[9],xmm3[10],xmm4[10],xmm3[11],xmm4[11],xmm3[12],xmm4[12],xmm3[13],xmm4[13],xmm3[14],xmm4[14],xmm3[15],xmm4[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[u],zero,zero,xmm3[10,11,u],zero,zero,xmm3[12,13,u],zero,zero,xmm3[14,15,u]
-; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm1 = xmm1[8],xmm2[8],xmm1[9],xmm2[9],xmm1[10],xmm2[10],xmm1[11],xmm2[11],xmm1[12],xmm2[12],xmm1[13],xmm2[13],xmm1[14],xmm2[14],xmm1[15],xmm2[15]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm1[u,10,11],zero,zero,xmm1[u,12,13],zero,zero,xmm1[u,14,15],zero,zero,xmm1[u]
-; AVX512DQ-FCP-NEXT: vpor %xmm3, %xmm1, %xmm1
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = zero,xmm1[1,2,3,4],zero,xmm1[6,7,8,9],zero,xmm1[11,12,13,14],zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[12],zero,zero,zero,zero,xmm0[13],zero,zero,zero,zero,xmm0[14],zero,zero,zero,zero,xmm0[15]
-; AVX512DQ-FCP-NEXT: vpor %xmm0, %xmm1, %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm0
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm2
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm3
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm1, %ymm0, %ymm4
+; AVX512DQ-FCP-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm5
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm5[0,2,0,2]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm6[0,8,u],zero,zero,ymm6[1,9,u],zero,zero,ymm6[2,10,u],zero,zero,ymm6[19,27,u],zero,zero,ymm6[20,28,u],zero,zero,ymm6[21,29,u],zero,zero
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,2,6,3,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm7, %ymm7
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm7[u,3,7],zero,zero,ymm7[u,8,12],zero,zero,ymm7[u,9,13],zero,zero,ymm7[u,18,22],zero,zero,ymm7[u,19,23],zero,zero,ymm7[u,24,28],zero,zero
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,2,2,0]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[0,8],zero,zero,ymm4[u,1,9],zero,zero,ymm4[u,2,10],zero,zero,ymm4[u,3,19],zero,zero,ymm4[u,28,20],zero,zero,ymm4[u,29,21],zero,zero,ymm4[u,30,22]
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,5,2,6,6,2,3,7]
+; AVX512DQ-FCP-NEXT: vpermd %ymm5, %ymm7, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm5[2,6,u],zero,zero,ymm5[3,7,u],zero,zero,ymm5[8,12,u],zero,zero,ymm5[9,17,u],zero,zero,ymm5[22,18,u],zero,zero,ymm5[23,19,u],zero,zero,ymm5[24,28]
+; AVX512DQ-FCP-NEXT: vmovdqa (%r8), %xmm7
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm5, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vporq %zmm6, %zmm4, %zmm4
+; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm7, %zmm5
+; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm6 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
+; AVX512DQ-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm5
+; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm5 = zmm5 ^ (mem & (zmm5 ^ zmm4))
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm2 = xmm2[u],zero,zero,xmm2[10,11,u],zero,zero,xmm2[12,13,u],zero,zero,xmm2[14,15,u]
+; AVX512DQ-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[u,10,11],zero,zero,xmm0[u,12,13],zero,zero,xmm0[u,14,15],zero,zero,xmm0[u]
+; AVX512DQ-FCP-NEXT: vpor %xmm2, %xmm0, %xmm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[1,2,3,4],zero,xmm0[6,7,8,9],zero,xmm0[11,12,13,14],zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm7[12],zero,zero,zero,zero,xmm7[13],zero,zero,zero,zero,xmm7[14],zero,zero,zero,zero,xmm7[15]
+; AVX512DQ-FCP-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX512DQ-FCP-NEXT: vmovdqa %xmm0, 64(%r9)
-; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm6, (%r9)
+; AVX512DQ-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
; AVX512DQ-FCP-NEXT: vzeroupper
; AVX512DQ-FCP-NEXT: retq
;
@@ -1489,11 +1489,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero
; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero
-; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
+; AVX512BW-NEXT: vpor %ymm9, %ymm8, %ymm8
; AVX512BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
; AVX512BW-NEXT: kmovd %eax, %k1
; AVX512BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1}
@@ -1507,12 +1507,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpermd %zmm4, %zmm6, %zmm6
; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512BW-NEXT: kmovq %rax, %k1
-; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX512BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
; AVX512BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
+; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
; AVX512BW-NEXT: vmovdqa %xmm1, 64(%r9)
; AVX512BW-NEXT: vmovdqa64 %zmm5, (%r9)
@@ -1544,12 +1544,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
; AVX512BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX512BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
; AVX512BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
; AVX512BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
@@ -1568,11 +1568,11 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,ymm6[7],zero,zero,zero,zero,ymm6[8],zero,zero,zero,zero,ymm6[9],zero,zero,zero,zero,zero,ymm6[26],zero,zero,zero,zero,ymm6[27],zero,zero,zero,zero,ymm6[28],zero,zero
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm6[2,3,0,1]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,ymm8[9],zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[6],zero,zero,zero,zero,ymm5[7],zero,zero,zero,zero,ymm5[8],zero,zero,zero,zero,ymm5[9,25],zero,zero,zero,zero,ymm5[26],zero,zero,zero,zero,ymm5[27],zero,zero,zero,zero,ymm5[28]
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm9 = ymm5[2,3,0,1]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = zero,ymm9[6],zero,zero,zero,zero,ymm9[7],zero,zero,zero,zero,ymm9[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm9[26],zero,zero,zero,zero,ymm9[27],zero,zero,zero,zero,ymm9[28],zero
-; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm5[2,3,0,1]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,ymm8[6],zero,zero,zero,zero,ymm8[7],zero,zero,zero,zero,ymm8[8],zero,zero,zero,zero,zero,zero,zero,zero,ymm8[26],zero,zero,zero,zero,ymm8[27],zero,zero,zero,zero,ymm8[28],zero
+; AVX512DQ-BW-NEXT: vpor %ymm9, %ymm8, %ymm8
; AVX512DQ-BW-NEXT: movl $831283992, %eax # imm = 0x318C6318
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
; AVX512DQ-BW-NEXT: vmovdqu8 %ymm7, %ymm8 {%k1}
@@ -1586,12 +1586,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpermd %zmm4, %zmm6, %zmm6
; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX512DQ-BW-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
+; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512DQ-BW-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
; AVX512DQ-BW-NEXT: vmovdqa %xmm1, 64(%r9)
; AVX512DQ-BW-NEXT: vmovdqa64 %zmm5, (%r9)
@@ -1623,12 +1623,12 @@ define void @store_i8_stride5_vf16(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm6, %zmm7, %zmm6
; AVX512DQ-BW-FCP-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm2 = xmm2[8],xmm3[8],xmm2[9],xmm3[9],xmm2[10],xmm3[10],xmm2[11],xmm3[11],xmm2[12],xmm3[12],xmm2[13],xmm3[13],xmm2[14],xmm3[14],xmm2[15],xmm3[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,xmm2[10,11],zero,zero,zero,xmm2[12,13],zero,zero,zero,xmm2[14,15],zero
; AVX512DQ-BW-FCP-NEXT: vpunpckhbw {{.*#+}} xmm0 = xmm0[8],xmm1[8],xmm0[9],xmm1[9],xmm0[10],xmm1[10],xmm0[11],xmm1[11],xmm0[12],xmm1[12],xmm0[13],xmm1[13],xmm0[14],xmm1[14],xmm0[15],xmm1[15]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm0 = zero,xmm0[10,11],zero,zero,zero,xmm0[12,13],zero,zero,zero,xmm0[14,15],zero,zero,zero
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm1 = xmm4[12],zero,zero,zero,zero,xmm4[13],zero,zero,zero,zero,xmm4[14],zero,zero,zero,zero,xmm4[15]
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm5 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpternlogq {{.*#+}} xmm1 = xmm1 | xmm0 | xmm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa %xmm1, 64(%r9)
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 %zmm5, (%r9)
@@ -2162,8 +2162,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0]
; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm6 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u,u,26,u,28,u,u,u,u,29,u,31,u,u,30]
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
; AVX2-NEXT: vpshufhw {{.*#+}} ymm7 = ymm3[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm7 = ymm7[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
@@ -2196,8 +2196,8 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm10[6],zero,xmm10[8,u],zero,xmm10[7],zero,xmm10[9],zero,xmm10[11,u],zero,xmm10[10],zero,xmm10[12]
; AVX2-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
; AVX2-NEXT: vpor %xmm10, %xmm9, %xmm9
-; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
; AVX2-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[u],zero,xmm8[u,10],zero,xmm8[12],zero,xmm8[u,11]
+; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
; AVX2-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
; AVX2-NEXT: vpor %xmm7, %xmm8, %xmm7
; AVX2-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
@@ -2210,9 +2210,9 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
; AVX2-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
; AVX2-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
; AVX2-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
+; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
; AVX2-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX2-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
@@ -2245,252 +2245,252 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX2-FP-LABEL: store_i8_stride5_vf32:
; AVX2-FP: # %bb.0:
-; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm4
-; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm1
-; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm2
-; AVX2-FP-NEXT: vmovdqa (%r8), %ymm0
-; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm6
-; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm7
-; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm8
-; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm9
-; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
-; AVX2-FP-NEXT: vmovdqa (%r8), %xmm10
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm11 = xmm10[0,0,1,1]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm11 = ymm11[0,0,0,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm12, %ymm5, %ymm11, %ymm5
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
-; AVX2-FP-NEXT: vpor %xmm9, %xmm8, %xmm8
+; AVX2-FP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX2-FP-NEXT: vmovdqa (%rsi), %ymm2
+; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm0
+; AVX2-FP-NEXT: vmovdqa (%rsi), %xmm4
+; AVX2-FP-NEXT: vmovdqa (%rdi), %xmm5
+; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm3 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm3 = xmm3[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm3 = ymm3[0,0,1,1]
+; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm6
+; AVX2-FP-NEXT: vmovdqa (%rcx), %xmm7
+; AVX2-FP-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm8, %ymm3
+; AVX2-FP-NEXT: vmovdqa (%r8), %xmm8
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm9 = xmm8[0,0,1,1]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,0,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm3, %ymm9, %ymm3
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[6],zero,xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[11],zero,xmm6[u,10],zero,xmm6[12],zero
; AVX2-FP-NEXT: vpor %xmm7, %xmm6, %xmm6
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
-; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm7 = xmm10[1,1,2,2]
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,1,1,1]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} xmm4 = zero,xmm4[8,u],zero,xmm4[7],zero,xmm4[9,u,11,u],zero,xmm4[10],zero,xmm4[12,u],zero
+; AVX2-FP-NEXT: vpor %xmm5, %xmm4, %xmm4
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm5, %ymm6, %ymm4, %ymm4
+; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm5 = xmm8[1,1,2,2]
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm4, %ymm5, %ymm4
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm6 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,ymm1[19,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25],zero,ymm1[23]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero
+; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm5
+; AVX2-FP-NEXT: vpor %ymm6, %ymm7, %ymm6
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm7 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm5[19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25]
; AVX2-FP-NEXT: vpor %ymm7, %ymm8, %ymm7
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
-; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm0[0,2,1,1,4,6,5,5]
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
+; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
+; AVX2-FP-NEXT: vmovdqa (%r8), %ymm7
+; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm7[0,2,1,1,4,6,5,5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm6, %ymm8, %ymm6
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[29,26],zero,ymm0[28],zero,ymm0[30],zero,ymm0[28,29],zero,ymm0[31],zero,ymm0[29]
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm5[27],zero,zero,ymm5[26],zero,ymm5[28],zero,ymm5[30],zero,zero,ymm5[29],zero,ymm5[31],zero
; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[26,27,28,29],zero,ymm1[31],zero,ymm1[29,30],zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
; AVX2-FP-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
-; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[2,2,3,3,6,6,7,7]
+; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm9 = ymm7[2,2,3,3,6,6,7,7]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX2-FP-NEXT: vpermd %ymm3, %ymm9, %ymm3
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX2-FP-NEXT: vpermd %ymm1, %ymm9, %ymm1
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
-; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
-; AVX2-FP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX2-FP-NEXT: vpblendvb %ymm9, %ymm1, %ymm2, %ymm1
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm5[13],zero,zero,zero,zero,ymm5[14],zero,zero,zero,zero,ymm5[15],zero,zero,zero,zero,ymm5[16],zero,zero,zero,zero,ymm5[17],zero,zero,zero,zero,ymm5[18],zero,zero
+; AVX2-FP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
+; AVX2-FP-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
-; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
-; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
-; AVX2-FP-NEXT: vpermd %ymm0, %ymm2, %ymm0
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [3,3,3,3,0,4,4,4]
+; AVX2-FP-NEXT: vpermd %ymm7, %ymm1, %ymm1
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX2-FP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqa %ymm0, 64(%r9)
; AVX2-FP-NEXT: vmovdqa %ymm8, 128(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm7, 96(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm6, 32(%r9)
-; AVX2-FP-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm6, 96(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm4, 32(%r9)
+; AVX2-FP-NEXT: vmovdqa %ymm3, (%r9)
; AVX2-FP-NEXT: vzeroupper
; AVX2-FP-NEXT: retq
;
; AVX2-FCP-LABEL: store_i8_stride5_vf32:
; AVX2-FCP: # %bb.0:
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm3
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm4
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm1
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm2
-; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm0
-; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm6
-; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm7
-; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %ymm2
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm0
+; AVX2-FCP-NEXT: vmovdqa (%rsi), %xmm5
+; AVX2-FCP-NEXT: vmovdqa (%rdi), %xmm6
+; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm1 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm4 = xmm1[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm1
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm7
+; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm8
+; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm9 = xmm8[0],xmm7[0],xmm8[1],xmm7[1],xmm8[2],xmm7[2],xmm8[3],xmm7[3],xmm8[4],xmm7[4],xmm8[5],xmm7[5],xmm8[6],xmm7[6],xmm8[7],xmm7[7]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = xmm9[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [0,0,0,0,0,0,1,1]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[6],zero,xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[11],zero,xmm7[u,10],zero,xmm7[12],zero
+; AVX2-FCP-NEXT: vpor %xmm7, %xmm8, %xmm7
+; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[0,0,1,1]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[8],zero,xmm6[u,7],zero,xmm6[9],zero,xmm6[u],zero,xmm6[u,10],zero,xmm6[12],zero,xmm6[u,11]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm5 = zero,xmm5[8,u],zero,xmm5[7],zero,xmm5[9,u,11,u],zero,xmm5[10],zero,xmm5[12,u],zero
+; AVX2-FCP-NEXT: vpor %xmm6, %xmm5, %xmm5
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %xmm8
-; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm9
-; AVX2-FCP-NEXT: vpunpcklbw {{.*#+}} xmm10 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm10[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm10 = [0,0,0,0,0,0,1,1]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm10, %ymm10
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm11, %ymm5, %ymm10, %ymm5
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
-; AVX2-FCP-NEXT: vpor %xmm9, %xmm8, %xmm8
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm7 = xmm7[8],zero,xmm7[u,7],zero,xmm7[9],zero,xmm7[u],zero,xmm7[u,10],zero,xmm7[12],zero,xmm7[u,11]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} xmm6 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
-; AVX2-FCP-NEXT: vpor %xmm7, %xmm6, %xmm6
-; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm8, %ymm6, %ymm6
-; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [1,1,2,2,2,2,2,2]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm7, %ymm7
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm6
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,ymm3[19,20],zero,ymm3[22],zero,ymm3[24],zero,ymm3[22,23],zero,ymm3[25],zero,ymm3[23]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm7, %ymm5, %ymm5
+; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm6 = [1,1,2,2,2,2,2,2]
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm5, %ymm6, %ymm5
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,ymm2[19,20],zero,ymm2[22],zero,ymm2[24],zero,ymm2[22,23],zero,ymm2[25],zero,ymm2[23]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
+; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm6
; AVX2-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm6[19],zero,ymm6[21],zero,zero,ymm6[20],zero,ymm6[22],zero,ymm6[24],zero,zero,ymm6[23],zero
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[19],zero,ymm0[21],zero,zero,ymm0[20],zero,ymm0[22],zero,ymm0[24],zero,zero,ymm0[23],zero,ymm0[25]
; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm8 = [4,6,5,5,5,5,4,6]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm8, %ymm8
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm8, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm7, %ymm8, %ymm7
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,ymm1[29,26],zero,ymm1[28],zero,ymm1[30],zero,ymm1[28,29],zero,ymm1[31],zero,ymm1[29]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,ymm0[29,26],zero,ymm0[28],zero,ymm0[30],zero,ymm0[28,29],zero,ymm0[31],zero,ymm0[29]
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm6[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm6[27],zero,zero,ymm6[26],zero,ymm6[28],zero,ymm6[30],zero,zero,ymm6[29],zero,ymm6[31],zero
; AVX2-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[29,26],zero,ymm2[28],zero,ymm2[26,27,28,29],zero,ymm2[31],zero,ymm2[29,30],zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[29,26],zero,ymm3[28],zero,ymm3[26,27,28,29],zero,ymm3[31],zero,ymm3[29,30],zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm4[26],zero,ymm4[28],zero,zero,zero,zero,ymm4[29],zero,ymm4[31],zero,zero,ymm4[30]
; AVX2-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [6,6,6,6,7,7,7,7]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm9, %ymm9
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm9, %ymm9
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm8, %ymm9, %ymm8
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm9 = [3,3,3,0,4,4,4,4]
-; AVX2-FCP-NEXT: vpermd %ymm3, %ymm9, %ymm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX2-FCP-NEXT: vpermd %ymm2, %ymm9, %ymm2
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm9 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm3, %ymm4, %ymm3
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm2 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
-; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm1 = zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
-; AVX2-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
-; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm3, %ymm1, %ymm1
+; AVX2-FCP-NEXT: vpblendvb %ymm9, %ymm2, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm3 = zero,zero,zero,zero,ymm6[13],zero,zero,zero,zero,ymm6[14],zero,zero,zero,zero,ymm6[15],zero,zero,zero,zero,ymm6[16],zero,zero,zero,zero,ymm6[17],zero,zero,zero,zero,ymm6[18],zero,zero
+; AVX2-FCP-NEXT: vpshufb {{.*#+}} ymm0 = zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
+; AVX2-FCP-NEXT: vpor %ymm3, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm3 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
+; AVX2-FCP-NEXT: vpblendvb %ymm3, %ymm2, %ymm0, %ymm0
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
-; AVX2-FCP-NEXT: vpermd %ymm0, %ymm2, %ymm0
+; AVX2-FCP-NEXT: vpermd %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm2 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm1, %ymm0, %ymm0
+; AVX2-FCP-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm0
; AVX2-FCP-NEXT: vmovdqa %ymm0, 64(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm8, 128(%r9)
; AVX2-FCP-NEXT: vmovdqa %ymm7, 96(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm6, 32(%r9)
-; AVX2-FCP-NEXT: vmovdqa %ymm5, (%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm5, 32(%r9)
+; AVX2-FCP-NEXT: vmovdqa %ymm4, (%r9)
; AVX2-FCP-NEXT: vzeroupper
; AVX2-FCP-NEXT: retq
;
; AVX512-LABEL: store_i8_stride5_vf32:
; AVX512: # %bb.0:
-; AVX512-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512-NEXT: vmovdqa (%rsi), %ymm4
-; AVX512-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512-NEXT: vmovdqa (%rcx), %ymm2
-; AVX512-NEXT: vmovdqa (%r8), %ymm0
+; AVX512-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512-NEXT: vmovdqa (%rdx), %ymm0
; AVX512-NEXT: vmovdqa (%rdi), %xmm5
; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11]
+; AVX512-NEXT: vmovdqa (%rcx), %ymm4
; AVX512-NEXT: vmovdqa (%rsi), %xmm7
; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
+; AVX512-NEXT: vmovdqa (%r8), %ymm1
+; AVX512-NEXT: vmovdqa (%rcx), %xmm9
+; AVX512-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
; AVX512-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX512-NEXT: vmovdqa (%rdx), %xmm8
+; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
-; AVX512-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero
-; AVX512-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX512-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6))
-; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm11 & (ymm10 ^ ymm6))
+; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm8
; AVX512-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; AVX512-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8))
-; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6))
+; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7]
; AVX512-NEXT: vmovdqa (%r8), %xmm6
; AVX512-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512-NEXT: vpermd %zmm6, %zmm8, %zmm6
; AVX512-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
; AVX512-NEXT: vpor %ymm5, %ymm8, %ymm5
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19]
-; AVX512-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
+; AVX512-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5))
-; AVX512-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u]
-; AVX512-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm5
+; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero
; AVX512-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
; AVX512-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX512-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5
; AVX512-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero
-; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[12],zero,zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero
+; AVX512-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,2,1,1,4,6,5,5]
; AVX512-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
; AVX512-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
; AVX512-NEXT: vpandn %ymm9, %ymm10, %ymm9
; AVX512-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
; AVX512-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
-; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30]
-; AVX512-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX512-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero
-; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u]
-; AVX512-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3))
-; AVX512-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
+; AVX512-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
+; AVX512-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[27,u],zero,ymm4[26],zero,ymm4[28],zero,ymm4[30,u],zero,ymm4[29],zero,ymm4[31,u]
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
-; AVX512-NEXT: vmovdqa %ymm0, 128(%r9)
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm7 & (ymm0 ^ ymm2))
+; AVX512-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
+; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
+; AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512-NEXT: vmovdqa %ymm1, 128(%r9)
; AVX512-NEXT: vmovdqa64 %zmm8, 64(%r9)
; AVX512-NEXT: vmovdqa64 %zmm6, (%r9)
; AVX512-NEXT: vzeroupper
@@ -2498,18 +2498,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512-FCP-LABEL: store_i8_stride5_vf32:
; AVX512-FCP: # %bb.0:
-; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX512-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm1
; AVX512-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
+; AVX512-FCP-NEXT: vmovdqa (%rcx), %ymm3
; AVX512-FCP-NEXT: vmovdqa (%rsi), %xmm6
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
; AVX512-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512-FCP-NEXT: vmovdqa (%rcx), %xmm7
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512-FCP-NEXT: vmovdqa (%rdx), %xmm9
; AVX512-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
; AVX512-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
@@ -2530,18 +2530,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
; AVX512-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7
; AVX512-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u]
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero
; AVX512-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
-; AVX512-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u,u,19]
+; AVX512-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21,u],zero,ymm2[20],zero,ymm2[22],zero,ymm2[24,u],zero,ymm2[23],zero,ymm2[25,u]
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
-; AVX512-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512-FCP-NEXT: vpor %ymm9, %ymm11, %ymm5
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
; AVX512-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
@@ -2555,15 +2555,15 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero
; AVX512-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
-; AVX512-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm2[26],zero,ymm2[28,u],zero,ymm2[u],zero,ymm2[29],zero,ymm2[31,u],zero,ymm2[30]
+; AVX512-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
-; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u]
-; AVX512-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u]
+; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
+; AVX512-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2))
+; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm6 & (ymm0 ^ ymm1))
; AVX512-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7]
; AVX512-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
@@ -2575,77 +2575,77 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-LABEL: store_i8_stride5_vf32:
; AVX512DQ: # %bb.0:
-; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm3
-; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm4
-; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm1
-; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm2
-; AVX512DQ-NEXT: vmovdqa (%r8), %ymm0
+; AVX512DQ-NEXT: vmovdqa (%rdi), %ymm2
+; AVX512DQ-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512DQ-NEXT: vmovdqa (%rdx), %ymm0
; AVX512DQ-NEXT: vmovdqa (%rdi), %xmm5
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm5[8],zero,xmm5[u,7],zero,xmm5[9],zero,xmm5[u],zero,xmm5[u,10],zero,xmm5[12],zero,xmm5[u,11]
+; AVX512DQ-NEXT: vmovdqa (%rcx), %ymm4
; AVX512DQ-NEXT: vmovdqa (%rsi), %xmm7
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9,u,11,u],zero,xmm7[10],zero,xmm7[12,u],zero
+; AVX512DQ-NEXT: vmovdqa (%r8), %ymm1
+; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm9
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm10 = zero,xmm9[6],zero,xmm9[8,u],zero,xmm9[7],zero,xmm9[9],zero,xmm9[11,u],zero,xmm9[10],zero,xmm9[12]
; AVX512DQ-NEXT: vpor %xmm6, %xmm8, %xmm6
+; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm8[6],zero,xmm8[8],zero,xmm8[u,7],zero,xmm8[9],zero,xmm8[11],zero,xmm8[u,10],zero,xmm8[12],zero
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
-; AVX512DQ-NEXT: vmovdqa (%rcx), %xmm8
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm9 = zero,xmm8[6],zero,xmm8[8,u],zero,xmm8[7],zero,xmm8[9],zero,xmm8[11,u],zero,xmm8[10],zero,xmm8[12]
-; AVX512DQ-NEXT: vmovdqa (%rdx), %xmm10
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm11 = xmm10[6],zero,xmm10[8],zero,xmm10[u,7],zero,xmm10[9],zero,xmm10[11],zero,xmm10[u,10],zero,xmm10[12],zero
-; AVX512DQ-NEXT: vpor %xmm9, %xmm11, %xmm9
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[0,0,1,1]
+; AVX512DQ-NEXT: vpor %xmm10, %xmm11, %xmm10
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm10 = ymm10[0,0,1,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm6 ^ (ymm11 & (ymm9 ^ ymm6))
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm6
-; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm8 = xmm8[0],xmm10[0],xmm8[1],xmm10[1],xmm8[2],xmm10[2],xmm8[3],xmm10[3],xmm8[4],xmm10[4],xmm8[5],xmm10[5],xmm8[6],xmm10[6],xmm8[7],xmm10[7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm8 = xmm8[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm8 = ymm8[0,0,1,1]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm10 = ymm6 ^ (ymm11 & (ymm10 ^ ymm6))
+; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm6 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm6 = xmm6[2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm0, %zmm8
; AVX512DQ-NEXT: vpunpcklbw {{.*#+}} xmm5 = xmm5[0],xmm7[0],xmm5[1],xmm7[1],xmm5[2],xmm7[2],xmm5[3],xmm7[3],xmm5[4],xmm7[4],xmm5[5],xmm7[5],xmm5[6],xmm7[6],xmm5[7],xmm7[7]
; AVX512DQ-NEXT: vpshufb {{.*#+}} xmm5 = xmm5[0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm6 = ymm6[0,0,1,1]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm7 = [255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255,0,0,255,255,255]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm8 ^ (ymm7 & (ymm5 ^ ymm8))
-; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm6[4,5,6,7]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm5 = ymm6 ^ (ymm7 & (ymm5 ^ ymm6))
+; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm5[0,1,2,3],zmm8[4,5,6,7]
; AVX512DQ-NEXT: vmovdqa (%r8), %xmm6
; AVX512DQ-NEXT: vpmovsxbd {{.*#+}} zmm8 = [0,0,0,0,0,0,1,1,1,1,0,2,2,2,2,0]
; AVX512DQ-NEXT: vpermd %zmm6, %zmm8, %zmm6
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm6 = zmm6 ^ (mem & (zmm6 ^ zmm5))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,u,u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm4[u,u,u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
; AVX512DQ-NEXT: vpor %ymm5, %ymm8, %ymm5
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm4[u],zero,ymm4[13,u,u,u],zero,ymm4[14,u,u,u],zero,ymm4[15,u,u,u],zero,ymm4[16,u,u,u],zero,ymm4[17,u,u,u],zero,ymm4[18,u,u,u],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,13],zero,ymm3[u,u,u,14],zero,ymm3[u,u,u,15],zero,ymm3[u,u,u,16],zero,ymm3[u,u,u,17],zero,ymm3[u,u,u,18],zero,ymm3[u,u,u,19]
-; AVX512DQ-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
+; AVX512DQ-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm11 & (ymm8 ^ ymm5))
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero,zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm4[21,u],zero,ymm4[20],zero,ymm4[22],zero,ymm4[24,u],zero,ymm4[23],zero,ymm4[25,u]
-; AVX512DQ-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm5
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm4[19],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
; AVX512DQ-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm9 = ymm9 ^ (mem & (ymm9 ^ ymm5))
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm0, %zmm5
; AVX512DQ-NEXT: vshufi64x2 {{.*#+}} zmm5 = zmm8[0,1,2,3],zmm5[4,5,6,7]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[12],zero,zero,zero,zero,ymm0[13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,zero,ymm0[15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,zero,ymm0[17],zero,zero,zero,zero,ymm0[18],zero
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm0[0,2,1,1,4,6,5,5]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[12],zero,zero,zero,zero,ymm1[13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,zero,ymm1[15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,zero,ymm1[17],zero,zero,zero,zero,ymm1[18],zero
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm9 = ymm1[0,2,1,1,4,6,5,5]
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,3,3,2]
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
; AVX512DQ-NEXT: vpandn %ymm9, %ymm10, %ymm9
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm9, %zmm8, %zmm8
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,ymm3[27],zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30],zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm4 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm4[26],zero,ymm4[28,u],zero,ymm4[u],zero,ymm4[29],zero,ymm4[31,u],zero,ymm4[30]
-; AVX512DQ-NEXT: vpor %ymm3, %ymm4, %ymm3
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[27],zero,zero,ymm1[26],zero,ymm1[28],zero,ymm1[30],zero,zero,ymm1[29],zero,ymm1[31],zero,zero
-; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27,u],zero,ymm2[26],zero,ymm2[28],zero,ymm2[30,u],zero,ymm2[29],zero,ymm2[31,u]
-; AVX512DQ-NEXT: vpor %ymm1, %ymm2, %ymm1
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm3 ^ (ymm7 & (ymm1 ^ ymm3))
-; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm0 = ymm0[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
+; AVX512DQ-NEXT: vpor %ymm2, %ymm3, %ymm2
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
+; AVX512DQ-NEXT: vpshufb {{.*#+}} ymm3 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm4[27,u],zero,ymm4[26],zero,ymm4[28],zero,ymm4[30,u],zero,ymm4[29],zero,ymm4[31,u]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQ-NEXT: vpor %ymm0, %ymm3, %ymm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1))
-; AVX512DQ-NEXT: vmovdqa %ymm0, 128(%r9)
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm7 & (ymm0 ^ ymm2))
+; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm1 = ymm1[2,2,3,3,6,6,7,7]
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
+; AVX512DQ-NEXT: vmovdqa %ymm1, 128(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm8, 64(%r9)
; AVX512DQ-NEXT: vmovdqa64 %zmm6, (%r9)
; AVX512DQ-NEXT: vzeroupper
@@ -2653,18 +2653,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
;
; AVX512DQ-FCP-LABEL: store_i8_stride5_vf32:
; AVX512DQ-FCP: # %bb.0:
-; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm2
-; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %ymm1
+; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm1
; AVX512DQ-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
+; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %ymm3
; AVX512DQ-FCP-NEXT: vmovdqa (%rsi), %xmm6
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
; AVX512DQ-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512DQ-FCP-NEXT: vmovdqa (%rcx), %xmm7
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm8 = zero,xmm7[6],zero,xmm7[8,u],zero,xmm7[7],zero,xmm7[9],zero,xmm7[11,u],zero,xmm7[10],zero,xmm7[12]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
; AVX512DQ-FCP-NEXT: vmovdqa (%rdx), %xmm9
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} xmm10 = xmm9[6],zero,xmm9[8],zero,xmm9[u,7],zero,xmm9[9],zero,xmm9[11],zero,xmm9[u,10],zero,xmm9[12],zero
; AVX512DQ-FCP-NEXT: vpor %xmm8, %xmm10, %xmm8
@@ -2685,18 +2685,18 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} zmm7 = [0,0,0,0,0,0,1,1,9,9,0,10,10,10,10,0]
; AVX512DQ-FCP-NEXT: vpermd %zmm4, %zmm7, %zmm7
; AVX512DQ-FCP-NEXT: vpternlogd {{.*#+}} zmm7 = zmm7 ^ (mem & (zmm7 ^ zmm5))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm1[u,u,u],zero,ymm1[13,u,u,u],zero,ymm1[14,u,u,u],zero,ymm1[15,u,u,u],zero,ymm1[16,u,u,u],zero,ymm1[17,u,u,u],zero,ymm1[18,u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm3[u,u,u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u]
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,13],zero,ymm0[u,u,u,14],zero,ymm0[u,u,u,15],zero,ymm0[u,u,u,16],zero,ymm0[u,u,u,17],zero,ymm0[u,u,u,18],zero,ymm0[u,u]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u],zero,ymm2[13,u,u,u],zero,ymm2[14,u,u,u],zero,ymm2[15,u,u,u],zero,ymm2[16,u,u,u],zero,ymm2[17,u,u,u],zero,ymm2[18,u,u,u],zero
; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm8, %ymm5
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u],zero,ymm3[13,u,u,u],zero,ymm3[14,u,u,u],zero,ymm3[15,u,u,u],zero,ymm3[16,u,u,u],zero,ymm3[17,u,u,u],zero,ymm3[18,u,u,u],zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm2[u,13],zero,ymm2[u,u,u,14],zero,ymm2[u,u,u,15],zero,ymm2[u,u,u,16],zero,ymm2[u,u,u,17],zero,ymm2[u,u,u,18],zero,ymm2[u,u,u,19]
-; AVX512DQ-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm1[u,13],zero,ymm1[u,u,u,14],zero,ymm1[u,u,u,15],zero,ymm1[u,u,u,16],zero,ymm1[u,u,u,17],zero,ymm1[u,u,u,18],zero,ymm1[u,u,u,19]
+; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm8, %ymm8
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm11 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21,u],zero,ymm2[20],zero,ymm2[22],zero,ymm2[24,u],zero,ymm2[23],zero,ymm2[25,u]
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm8 = ymm5 ^ (ymm10 & (ymm8 ^ ymm5))
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21,u],zero,ymm3[20],zero,ymm3[22],zero,ymm3[24,u],zero,ymm3[23],zero,ymm3[25,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm5, %ymm9, %ymm5
+; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm11, %ymm5
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm5 = ymm5[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[19],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm10 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
; AVX512DQ-FCP-NEXT: vpor %ymm9, %ymm10, %ymm9
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
@@ -2710,15 +2710,15 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[12],zero,zero,zero,zero,ymm4[13],zero,zero,zero,zero,ymm4[14],zero,zero,zero,zero,ymm4[15],zero,zero,zero,zero,ymm4[16],zero,zero,zero,zero,ymm4[17],zero,zero,zero,zero,ymm4[18],zero
; AVX512DQ-FCP-NEXT: vinserti64x4 $1, %ymm8, %zmm9, %zmm8
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} zmm8 = zmm8 | (zmm5 & mem)
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,ymm2[27],zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30],zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm3[26],zero,ymm3[28,u],zero,ymm3[u],zero,ymm3[29],zero,ymm3[31,u],zero,ymm3[30]
-; AVX512DQ-FCP-NEXT: vpor %ymm2, %ymm3, %ymm2
-; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27,u],zero,ymm2[26],zero,ymm2[28,u],zero,ymm2[u],zero,ymm2[29],zero,ymm2[31,u],zero,ymm2[30]
+; AVX512DQ-FCP-NEXT: vpor %ymm1, %ymm2, %ymm1
; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
-; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm1[27,u],zero,ymm1[26],zero,ymm1[28],zero,ymm1[30,u],zero,ymm1[29],zero,ymm1[31,u]
-; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm1, %ymm0
+; AVX512DQ-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27,u],zero,ymm3[26],zero,ymm3[28],zero,ymm3[30,u],zero,ymm3[29],zero,ymm3[31,u]
+; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
+; AVX512DQ-FCP-NEXT: vpor %ymm0, %ymm2, %ymm0
; AVX512DQ-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm2 ^ (ymm6 & (ymm0 ^ ymm2))
+; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm0 = ymm1 ^ (ymm6 & (ymm0 ^ ymm1))
; AVX512DQ-FCP-NEXT: vpmovsxbd {{.*#+}} ymm1 = [6,6,6,0,7,7,7,7]
; AVX512DQ-FCP-NEXT: vpermd %ymm4, %ymm1, %ymm1
; AVX512DQ-FCP-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 ^ (mem & (ymm1 ^ ymm0))
@@ -2761,25 +2761,25 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-NEXT: vpermd %zmm6, %zmm7, %zmm6
; AVX512BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512BW-NEXT: kmovq %rax, %k1
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1}
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
-; AVX512BW-NEXT: vpor %ymm6, %ymm7, %ymm6
+; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm6
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
; AVX512BW-NEXT: vpor %ymm7, %ymm8, %ymm7
; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
-; AVX512BW-NEXT: vpermd %ymm4, %ymm7, %ymm7
+; AVX512BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4]
+; AVX512BW-NEXT: vpermd %ymm4, %ymm8, %ymm8
; AVX512BW-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512BW-NEXT: kmovd %eax, %k1
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
-; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
-; AVX512BW-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
+; AVX512BW-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
; AVX512BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
; AVX512BW-NEXT: kmovq %rax, %k1
; AVX512BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
@@ -2822,11 +2822,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-LABEL: store_i8_stride5_vf32:
; AVX512BW-FCP: # %bb.0:
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX512BW-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm2
; AVX512BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
+; AVX512BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
; AVX512BW-FCP-NEXT: vmovdqa (%rsi), %xmm6
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
; AVX512BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
@@ -2853,19 +2853,19 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero
; AVX512BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX512BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm7
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
; AVX512BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
; AVX512BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
; AVX512BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7
; AVX512BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
; AVX512BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
@@ -2877,12 +2877,12 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6
; AVX512BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
; AVX512BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30]
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
-; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
+; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
; AVX512BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
@@ -2933,25 +2933,25 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-NEXT: vpermd %zmm6, %zmm7, %zmm6
; AVX512DQ-BW-NEXT: movabsq $595056260442243600, %rax # imm = 0x842108421084210
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm3 {%k1}
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm1[12,13],zero,zero,zero,zero,ymm1[14],zero,zero,zero,ymm1[14,15],zero,zero,zero,zero,ymm1[16],zero,zero,zero,ymm1[16,17],zero,zero,zero,zero,ymm1[18],zero,zero,zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm6, %ymm7, %ymm6
+; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm6
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm1[21],zero,ymm1[21,20],zero,ymm1[22],zero,ymm1[24],zero,ymm1[22,23],zero,ymm1[25]
; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm8, %ymm7
; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
-; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
-; AVX512DQ-BW-NEXT: vpermd %ymm4, %ymm7, %ymm7
+; AVX512DQ-BW-NEXT: vpmovsxbd {{.*#+}} ymm8 = [3,3,3,0,4,4,4,4]
+; AVX512DQ-BW-NEXT: vpermd %ymm4, %ymm8, %ymm8
; AVX512DQ-BW-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512DQ-BW-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
-; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
-; AVX512DQ-BW-NEXT: vpor %ymm8, %ymm9, %ymm8
-; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm8, %zmm7, %zmm7
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm8 {%k1} = ymm5[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm9 = ymm5[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm5[21],zero,zero,ymm5[20],zero,ymm5[22],zero,ymm5[24],zero,zero,ymm5[23],zero,ymm5[25],zero
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
+; AVX512DQ-BW-NEXT: vpshufb {{.*#+}} ymm7 = ymm4[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm4[21],zero,zero,ymm4[20],zero,ymm4[22],zero,ymm4[24],zero,zero,ymm4[23],zero,ymm4[25],zero,zero
+; AVX512DQ-BW-NEXT: vpor %ymm7, %ymm9, %ymm7
+; AVX512DQ-BW-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
+; AVX512DQ-BW-NEXT: vinserti64x4 $1, %ymm7, %zmm8, %zmm7
; AVX512DQ-BW-NEXT: movabsq $-8330787646191410408, %rax # imm = 0x8C6318C6318C6318
; AVX512DQ-BW-NEXT: kmovq %rax, %k1
; AVX512DQ-BW-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
@@ -2994,11 +2994,11 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-LABEL: store_i8_stride5_vf32:
; AVX512DQ-BW-FCP: # %bb.0:
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %ymm1
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm3
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdx), %ymm0
-; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm2
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rdi), %xmm4
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm5 = xmm4[8],zero,xmm4[u,7],zero,xmm4[9],zero,xmm4[u],zero,xmm4[u,10],zero,xmm4[12],zero,xmm4[u,11]
+; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rcx), %ymm3
; AVX512DQ-BW-FCP-NEXT: vmovdqa (%rsi), %xmm6
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} xmm7 = zero,xmm6[8,u],zero,xmm6[7],zero,xmm6[9,u,11,u],zero,xmm6[10],zero,xmm6[12,u],zero
; AVX512DQ-BW-FCP-NEXT: vpor %xmm5, %xmm7, %xmm5
@@ -3025,19 +3025,19 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm4 {%k1}
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm6 = zero,zero,ymm0[12,13],zero,zero,zero,zero,ymm0[14],zero,zero,zero,ymm0[14,15],zero,zero,zero,zero,ymm0[16],zero,zero,zero,ymm0[16,17],zero,zero,zero,zero,ymm0[18],zero,zero,zero
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm2[13],zero,zero,zero,zero,ymm2[14],zero,zero,zero,zero,ymm2[15],zero,zero,zero,zero,ymm2[16],zero,zero,zero,zero,ymm2[17],zero,zero,zero,zero,ymm2[18],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = zero,zero,zero,zero,ymm3[13],zero,zero,zero,zero,ymm3[14],zero,zero,zero,zero,ymm3[15],zero,zero,zero,zero,ymm3[16],zero,zero,zero,zero,ymm3[17],zero,zero,zero,zero,ymm3[18],zero,zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm3[19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm6, %ymm7, %ymm6
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,18,19],zero,ymm0[21],zero,ymm0[21,20],zero,ymm0[22],zero,ymm0[24],zero,ymm0[22,23],zero,ymm0[25]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm2[19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm7, %ymm8, %ymm7
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm7
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm7 = ymm7[2,2,3,3]
; AVX512DQ-BW-FCP-NEXT: vinserti64x4 $1, %ymm7, %zmm6, %zmm6
; AVX512DQ-BW-FCP-NEXT: vpmovsxbd {{.*#+}} ymm7 = [3,3,3,0,4,4,4,4]
; AVX512DQ-BW-FCP-NEXT: vpermd %ymm1, %ymm7, %ymm7
; AVX512DQ-BW-FCP-NEXT: movl $138547332, %eax # imm = 0x8421084
; AVX512DQ-BW-FCP-NEXT: kmovd %eax, %k1
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm3[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm3[21],zero,zero,ymm3[20],zero,ymm3[22],zero,ymm3[24],zero,zero,ymm3[23],zero,ymm3[25],zero
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm7 {%k1} = ymm2[u,u,13,u,u,u,u,14,u,u,u,u,15,u,u,u,u,16,u,u,u,u,17,u,u,u,u,18,u,u,u,u]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm8 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,19],zero,ymm2[21],zero,zero,ymm2[20],zero,ymm2[22],zero,ymm2[24],zero,zero,ymm2[23],zero,ymm2[25],zero
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm9 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm1[21],zero,zero,ymm1[20],zero,ymm1[22],zero,ymm1[24],zero,zero,ymm1[23],zero,ymm1[25],zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm8, %ymm9, %ymm8
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
@@ -3049,12 +3049,12 @@ define void @store_i8_stride5_vf32(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpermd %zmm5, %zmm6, %zmm6
; AVX512DQ-BW-FCP-NEXT: movabsq $1190112520884487201, %rax # imm = 0x1084210842108421
; AVX512DQ-BW-FCP-NEXT: kmovq %rax, %k1
-; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm3[26],zero,ymm3[28],zero,zero,zero,zero,ymm3[29],zero,ymm3[31],zero,zero,ymm3[30]
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm2[26],zero,ymm2[28],zero,zero,zero,zero,ymm2[29],zero,ymm2[31],zero,zero,ymm2[30]
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm1 = ymm1[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm1[26],zero,ymm1[28],zero,zero,ymm1[27],zero,ymm1[29],zero,ymm1[31],zero,zero,ymm1[30],zero
-; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vmovdqu8 %zmm6, %zmm7 {%k1}
+; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm1, %ymm1
+; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm3[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm3[27],zero,zero,ymm3[26],zero,ymm3[28],zero,ymm3[30],zero,zero,ymm3[29],zero,ymm3[31],zero
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm2 = ymm2[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25],zero,ymm2[27],zero,zero,ymm2[26],zero,ymm2[28],zero,ymm2[30],zero,zero,ymm2[29],zero,ymm2[31],zero
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm0 = ymm0[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,ymm0[27],zero,zero,ymm0[26],zero,ymm0[28],zero,ymm0[30],zero,zero,ymm0[29],zero,ymm0[31],zero,zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm2, %ymm0, %ymm0
; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
@@ -4107,201 +4107,201 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-LABEL: store_i8_stride5_vf64:
; AVX2: # %bb.0:
; AVX2-NEXT: subq $248, %rsp
-; AVX2-NEXT: vmovdqa 32(%rdi), %ymm4
+; AVX2-NEXT: vmovdqa 32(%rdi), %ymm6
; AVX2-NEXT: vmovdqa (%rcx), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa 32(%rcx), %xmm7
-; AVX2-NEXT: vmovdqa %xmm7, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa 32(%rcx), %xmm9
+; AVX2-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [128,6,128,8,u,128,7,128,9,128,11,u,128,10,128,12]
; AVX2-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-NEXT: vmovdqa (%rdx), %xmm3
; AVX2-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10
-; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
; AVX2-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-NEXT: vpor %xmm1, %xmm3, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vmovdqa (%rdi), %xmm5
-; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa (%rdi), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vmovdqa {{.*#+}} xmm3 = [8,128,u,7,128,9,128,u,128,u,10,128,12,128,u,11]
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm5
-; AVX2-NEXT: vmovdqa (%rsi), %xmm6
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm8 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
-; AVX2-NEXT: vpshufb %xmm8, %xmm6, %xmm9
-; AVX2-NEXT: vpor %xmm5, %xmm9, %xmm5
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,0,1,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm1, %ymm5, %ymm1
-; AVX2-NEXT: vmovdqa (%r8), %xmm5
-; AVX2-NEXT: vmovdqa %xmm5, (%rsp) # 16-byte Spill
-; AVX2-NEXT: vpshufd {{.*#+}} xmm5 = xmm5[1,1,2,2]
-; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm5[0,1,1,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm4
+; AVX2-NEXT: vmovdqa (%rsi), %xmm7
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm5 = [128,8,u,128,7,128,9,u,11,u,128,10,128,12,u,128]
+; AVX2-NEXT: vpshufb %xmm5, %xmm7, %xmm8
+; AVX2-NEXT: vpor %xmm4, %xmm8, %xmm4
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm8 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vmovdqa 32(%rdx), %xmm10
+; AVX2-NEXT: vmovdqa %xmm10, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vmovdqa (%r8), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, (%rsp) # 16-byte Spill
+; AVX2-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,1,1,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm11 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm11, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa 32(%rdi), %xmm5
-; AVX2-NEXT: vmovdqa %xmm5, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm0, %xmm7, %xmm0
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm4
+; AVX2-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-NEXT: vpshufb %xmm0, %xmm9, %xmm0
; AVX2-NEXT: vpshufb %xmm2, %xmm10, %xmm1
; AVX2-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX2-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-NEXT: vpshufb %xmm3, %xmm5, %xmm1
-; AVX2-NEXT: vpshufb %xmm8, %xmm2, %xmm2
-; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-NEXT: vmovdqa 32(%rsi), %ymm11
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
+; AVX2-NEXT: vpshufb %xmm3, %xmm4, %xmm1
+; AVX2-NEXT: vpshufb %xmm5, %xmm2, %xmm2
+; AVX2-NEXT: vpor %xmm1, %xmm2, %xmm1
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpblendvb %ymm9, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqa 32(%r8), %xmm1
; AVX2-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-NEXT: vpshufd {{.*#+}} xmm1 = xmm1[1,1,2,2]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
-; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm1, %ymm0
+; AVX2-NEXT: vpblendvb %ymm11, %ymm0, %ymm1, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm15 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
-; AVX2-NEXT: vpshufb %ymm15, %ymm4, %ymm1
-; AVX2-NEXT: vmovdqa %ymm4, %ymm13
-; AVX2-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
-; AVX2-NEXT: # ymm4 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufb %ymm4, %ymm11, %ymm3
+; AVX2-NEXT: vmovdqa 32(%rsi), %ymm15
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
+; AVX2-NEXT: vpshufb %ymm0, %ymm6, %ymm1
+; AVX2-NEXT: vmovdqa %ymm6, %ymm12
+; AVX2-NEXT: vmovdqu %ymm6, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm6 = [19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128]
+; AVX2-NEXT: # ymm6 = mem[0,1,0,1]
+; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm3
; AVX2-NEXT: vpor %ymm1, %ymm3, %ymm1
-; AVX2-NEXT: vmovdqa 32(%rdx), %ymm12
+; AVX2-NEXT: vmovdqa 32(%rdx), %ymm11
; AVX2-NEXT: vmovdqa 32(%rcx), %ymm14
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm3 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX2-NEXT: # ymm3 = mem[0,1,0,1]
-; AVX2-NEXT: vpshufb %ymm3, %ymm14, %ymm8
+; AVX2-NEXT: vpshufb %ymm3, %ymm14, %ymm4
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
-; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm10
-; AVX2-NEXT: vpor %ymm8, %ymm10, %ymm8
+; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm8
+; AVX2-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
-; AVX2-NEXT: vpblendvb %ymm10, %ymm1, %ymm8, %ymm2
-; AVX2-NEXT: vmovdqa (%rdi), %ymm9
-; AVX2-NEXT: vpshufb %ymm15, %ymm9, %ymm1
-; AVX2-NEXT: vmovdqa (%rsi), %ymm15
-; AVX2-NEXT: vpshufb %ymm4, %ymm15, %ymm4
-; AVX2-NEXT: vpor %ymm1, %ymm4, %ymm4
-; AVX2-NEXT: vmovdqa (%rcx), %ymm7
-; AVX2-NEXT: vpshufb %ymm3, %ymm7, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm1, %ymm4, %ymm4
+; AVX2-NEXT: vmovdqa (%rdi), %ymm13
+; AVX2-NEXT: vpshufb %ymm0, %ymm13, %ymm1
+; AVX2-NEXT: vmovdqa (%rsi), %ymm10
+; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm6
+; AVX2-NEXT: vpor %ymm1, %ymm6, %ymm6
+; AVX2-NEXT: vmovdqa (%rcx), %ymm8
+; AVX2-NEXT: vpshufb %ymm3, %ymm8, %ymm0
; AVX2-NEXT: vmovdqa (%rdx), %ymm3
; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm5
; AVX2-NEXT: vpor %ymm0, %ymm5, %ymm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm5 = ymm6[2,2,3,3]
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX2-NEXT: vpblendvb %ymm10, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vmovdqa 32(%r8), %ymm10
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm10[0,2,1,1,4,6,5,5]
-; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vmovdqa (%r8), %ymm8
-; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm8[0,2,1,1,4,6,5,5]
+; AVX2-NEXT: vpblendvb %ymm2, %ymm5, %ymm0, %ymm5
+; AVX2-NEXT: vmovdqa 32(%r8), %ymm9
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm9[0,2,1,1,4,6,5,5]
+; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,3,3,2]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm4, %ymm2, %ymm0
+; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-NEXT: vmovdqa (%r8), %ymm2
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm2[0,2,1,1,4,6,5,5]
; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,3,3,2]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm6, %ymm5, %ymm4, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,0,4,4,4,4]
-; AVX2-NEXT: vpermd %ymm13, %ymm2, %ymm4
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm5 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
-; AVX2-NEXT: vpshufb %ymm5, %ymm11, %ymm0
+; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4]
+; AVX2-NEXT: vpermd %ymm12, %ymm4, %ymm5
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm6 = [0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14,0,0,13,2,15,0,1,14]
+; AVX2-NEXT: vpshufb %ymm6, %ymm15, %ymm0
; AVX2-NEXT: vmovdqa {{.*#+}} ymm1 = [u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255,0,u,u,u,255]
-; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpermd %ymm9, %ymm2, %ymm2
-; AVX2-NEXT: vpshufb %ymm5, %ymm15, %ymm4
-; AVX2-NEXT: vpblendvb %ymm1, %ymm2, %ymm4, %ymm1
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
-; AVX2-NEXT: vpshufb %ymm2, %ymm14, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128]
-; AVX2-NEXT: vpshufb %ymm5, %ymm12, %ymm13
-; AVX2-NEXT: vpor %ymm4, %ymm13, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm13 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
-; AVX2-NEXT: vpblendvb %ymm13, %ymm0, %ymm4, %ymm0
-; AVX2-NEXT: vpshufb %ymm2, %ymm7, %ymm2
-; AVX2-NEXT: vpshufb %ymm5, %ymm3, %ymm4
-; AVX2-NEXT: vpor %ymm2, %ymm4, %ymm2
-; AVX2-NEXT: vpblendvb %ymm13, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm2 = [3,3,3,3,0,4,4,4]
-; AVX2-NEXT: vpermd %ymm10, %ymm2, %ymm4
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm1, %ymm5, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm13, %ymm4, %ymm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm10, %ymm5
+; AVX2-NEXT: vpblendvb %ymm1, %ymm4, %ymm5, %ymm1
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128]
+; AVX2-NEXT: vpshufb %ymm4, %ymm14, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128,128,128]
+; AVX2-NEXT: vpshufb %ymm6, %ymm11, %ymm12
+; AVX2-NEXT: vpor %ymm5, %ymm12, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm12 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
+; AVX2-NEXT: vpblendvb %ymm12, %ymm0, %ymm5, %ymm0
+; AVX2-NEXT: vpshufb %ymm4, %ymm8, %ymm4
+; AVX2-NEXT: vpshufb %ymm6, %ymm3, %ymm5
+; AVX2-NEXT: vpor %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpblendvb %ymm12, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4]
+; AVX2-NEXT: vpermd %ymm9, %ymm4, %ymm5
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm6 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
+; AVX2-NEXT: vpblendvb %ymm6, %ymm0, %ymm5, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-NEXT: vpermd %ymm8, %ymm2, %ymm0
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm0, %ymm0
+; AVX2-NEXT: vpermd %ymm2, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm6, %ymm1, %ymm0, %ymm0
; AVX2-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 16-byte Reload
-; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3],xmm0[4],xmm6[4],xmm0[5],xmm6[5],xmm0[6],xmm6[6],xmm0[7],xmm6[7]
+; AVX2-NEXT: vpunpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm13 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
-; AVX2-NEXT: vpshufb %xmm13, %xmm0, %xmm0
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm0[0,0,1,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} xmm6 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
-; AVX2-NEXT: vpshufb %xmm6, %xmm1, %xmm0
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm12 = [0,1,4,5,u,2,3,6,7,10,11,u,8,9,12,13]
+; AVX2-NEXT: vpshufb %xmm12, %xmm0, %xmm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm0[0,0,1,1]
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm7 = [2,u,1,0,5,4,u,3,u,7,6,11,10,u,9,8]
+; AVX2-NEXT: vpshufb %xmm7, %xmm1, %xmm0
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm0[0,0,1,1]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm0 = [255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm2, %ymm1, %ymm4
+; AVX2-NEXT: vpblendvb %ymm0, %ymm4, %ymm1, %ymm6
; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Reload
; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm1, %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = xmm1[0],mem[0],xmm1[1],mem[1],xmm1[2],mem[2],xmm1[3],mem[3],xmm1[4],mem[4],xmm1[5],mem[5],xmm1[6],mem[6],xmm1[7],mem[7]
-; AVX2-NEXT: vpshufb %xmm13, %xmm1, %xmm1
-; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
-; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
-; AVX2-NEXT: # xmm2 = xmm2[0],mem[0],xmm2[1],mem[1],xmm2[2],mem[2],xmm2[3],mem[3],xmm2[4],mem[4],xmm2[5],mem[5],xmm2[6],mem[6],xmm2[7],mem[7]
-; AVX2-NEXT: vpshufb %xmm6, %xmm2, %xmm2
+; AVX2-NEXT: vpshufb %xmm12, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm4 # 16-byte Reload
+; AVX2-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm4, %xmm4 # 16-byte Folded Reload
+; AVX2-NEXT: # xmm4 = xmm4[0],mem[0],xmm4[1],mem[1],xmm4[2],mem[2],xmm4[3],mem[3],xmm4[4],mem[4],xmm4[5],mem[5],xmm4[6],mem[6],xmm4[7],mem[7]
+; AVX2-NEXT: vpshufb %xmm7, %xmm4, %xmm4
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
-; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm2, %ymm0
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
+; AVX2-NEXT: vpblendvb %ymm0, %ymm1, %ymm4, %ymm0
; AVX2-NEXT: vpshufd $80, (%rsp), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[0,0,1,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm2 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm4, %ymm1, %ymm4
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
+; AVX2-NEXT: vpblendvb %ymm4, %ymm6, %ymm1, %ymm6
; AVX2-NEXT: vpshufd $80, {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload
; AVX2-NEXT: # xmm1 = mem[0,0,1,1]
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,0,1]
-; AVX2-NEXT: vpblendvb %ymm2, %ymm0, %ymm1, %ymm6
+; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm7
; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm0 = [9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12,9,14,11,0,13,10,15,12]
; AVX2-NEXT: vpshufb %ymm0, %ymm14, %ymm1
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm12[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm4 = ymm11[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm4 = ymm4[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm5 = [255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0,255,0,255,0,0,255,0,255,0,255,0,0,255,0,255,0]
; AVX2-NEXT: # ymm5 = mem[0,1,0,1]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufb %ymm0, %ymm7, %ymm0
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm2 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm2 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
-; AVX2-NEXT: vpshufb %ymm2, %ymm11, %ymm3
+; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
+; AVX2-NEXT: vpshufb %ymm0, %ymm8, %ymm0
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm3 = ymm3[0,1,2,3,5,6,7,6,8,9,10,11,13,14,15,14]
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm3[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpbroadcastq {{.*#+}} ymm3 = [11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14,11,0,13,10,15,12,0,14]
+; AVX2-NEXT: vpshufb %ymm3, %ymm15, %ymm4
; AVX2-NEXT: vpshufhw $230, {{[-0-9]+}}(%r{{[sb]}}p), %ymm5 # 32-byte Folded Reload
; AVX2-NEXT: # ymm5 = mem[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7]
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
-; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm3, %ymm5, %ymm3
-; AVX2-NEXT: vpshufb %ymm2, %ymm15, %ymm2
-; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm9[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255,255,0,0,255,0,255,0,0,0,0,255,0,255,0,0,255]
+; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
+; AVX2-NEXT: vpblendvb %ymm8, %ymm4, %ymm5, %ymm4
+; AVX2-NEXT: vpshufb %ymm3, %ymm10, %ymm3
+; AVX2-NEXT: vpshufhw {{.*#+}} ymm5 = ymm13[0,1,2,3,6,5,6,7,8,9,10,11,14,13,14,15]
; AVX2-NEXT: vpshufd {{.*#+}} ymm5 = ymm5[2,2,3,3,6,6,7,7]
-; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm5, %ymm2
+; AVX2-NEXT: vpblendvb %ymm8, %ymm3, %ymm5, %ymm3
; AVX2-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
-; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
; AVX2-NEXT: vmovdqa {{.*#+}} ymm5 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpblendvb %ymm5, %ymm1, %ymm4, %ymm1
; AVX2-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm3, %ymm0
+; AVX2-NEXT: vpshufd {{.*#+}} ymm3 = ymm9[2,2,3,3,6,6,7,7]
+; AVX2-NEXT: vpermq {{.*#+}} ymm3 = ymm3[2,2,3,3]
+; AVX2-NEXT: vmovdqa {{.*#+}} ymm4 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
+; AVX2-NEXT: vpblendvb %ymm4, %ymm1, %ymm3, %ymm1
+; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm2[2,2,3,3,6,6,7,7]
; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX2-NEXT: vpblendvb %ymm5, %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm10[2,2,3,3,6,6,7,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX2-NEXT: vmovdqa {{.*#+}} ymm3 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm1, %ymm2, %ymm1
-; AVX2-NEXT: vpshufd {{.*#+}} ymm2 = ymm8[2,2,3,3,6,6,7,7]
-; AVX2-NEXT: vpermq {{.*#+}} ymm2 = ymm2[2,2,3,3]
-; AVX2-NEXT: vpblendvb %ymm3, %ymm0, %ymm2, %ymm0
+; AVX2-NEXT: vpblendvb %ymm4, %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 64(%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
@@ -4311,11 +4311,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm2 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm2, 256(%r9)
; AVX2-NEXT: vmovdqa %ymm0, 128(%r9)
-; AVX2-NEXT: vmovdqa %ymm6, 160(%r9)
+; AVX2-NEXT: vmovdqa %ymm7, 160(%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 192(%r9)
; AVX2-NEXT: vmovdqa %ymm1, 288(%r9)
-; AVX2-NEXT: vmovdqa %ymm4, (%r9)
+; AVX2-NEXT: vmovdqa %ymm6, (%r9)
; AVX2-NEXT: vmovups {{[-0-9]+}}(%r{{[sb]}}p), %ymm0 # 32-byte Reload
; AVX2-NEXT: vmovaps %ymm0, 32(%r9)
; AVX2-NEXT: addq $248, %rsp
@@ -4334,8 +4334,6 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpshufb %xmm0, %xmm1, %xmm1
; AVX2-FP-NEXT: vmovdqa (%rdx), %xmm3
; AVX2-FP-NEXT: vmovdqa %xmm3, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
-; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9
-; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa {{.*#+}} xmm2 = [6,128,8,128,u,7,128,9,128,11,128,u,10,128,12,128]
; AVX2-FP-NEXT: vpshufb %xmm2, %xmm3, %xmm3
; AVX2-FP-NEXT: vpor %xmm1, %xmm3, %xmm1
@@ -4352,6 +4350,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255]
; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm1, %ymm4, %ymm1
+; AVX2-FP-NEXT: vmovdqa 32(%rdx), %xmm9
+; AVX2-FP-NEXT: vmovdqa %xmm9, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vmovdqa (%r8), %xmm4
; AVX2-FP-NEXT: vmovdqa %xmm4, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FP-NEXT: vpshufd {{.*#+}} xmm4 = xmm4[1,1,2,2]
@@ -4366,11 +4366,10 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpor %xmm0, %xmm1, %xmm0
; AVX2-FP-NEXT: vmovdqa 32(%rsi), %xmm2
; AVX2-FP-NEXT: vmovdqa %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FP-NEXT: vpshufb %xmm3, %xmm4, %xmm1
; AVX2-FP-NEXT: vpshufb %xmm5, %xmm2, %xmm2
; AVX2-FP-NEXT: vpor %xmm1, %xmm2, %xmm1
-; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2
-; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,0,1,1]
; AVX2-FP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqa 32(%r8), %xmm1
@@ -4379,12 +4378,13 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm1 = ymm1[0,1,1,1]
; AVX2-FP-NEXT: vpblendvb %ymm7, %ymm0, %ymm1, %ymm0
; AVX2-FP-NEXT: vmovdqu %ymm0, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
+; AVX2-FP-NEXT: vmovdqa 32(%rcx), %ymm2
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm0 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,29,26,128,28,128,30,128,28,29,128,31,128,29]
; AVX2-FP-NEXT: vpshufb %ymm0, %ymm12, %ymm1
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
; AVX2-FP-NEXT: # ymm8 = mem[0,1,0,1]
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm3
-; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm14
+; AVX2-FP-NEXT: vmovdqa %ymm2, %ymm13
; AVX2-FP-NEXT: vmovdqu %ymm2, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FP-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-FP-NEXT: vmovdqa 32(%rdi), %ymm4
@@ -4399,8 +4399,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
; AVX2-FP-NEXT: vpblendvb %ymm10, %ymm1, %ymm6, %ymm3
-; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm13
-; AVX2-FP-NEXT: vpshufb %ymm0, %ymm13, %ymm0
+; AVX2-FP-NEXT: vmovdqa (%rdx), %ymm14
+; AVX2-FP-NEXT: vpshufb %ymm0, %ymm14, %ymm0
; AVX2-FP-NEXT: vmovdqa (%rcx), %ymm7
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm7, %ymm1
; AVX2-FP-NEXT: vpor %ymm0, %ymm1, %ymm8
@@ -4431,23 +4431,23 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vpor %ymm9, %ymm15, %ymm9
; AVX2-FP-NEXT: vbroadcasti128 {{.*#+}} ymm15 = [128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128]
; AVX2-FP-NEXT: # ymm15 = mem[0,1,0,1]
-; AVX2-FP-NEXT: vpshufb %ymm15, %ymm14, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm15, %ymm13, %ymm0
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,25]
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm14
-; AVX2-FP-NEXT: vpor %ymm0, %ymm14, %ymm0
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm12, %ymm13
+; AVX2-FP-NEXT: vpor %ymm0, %ymm13, %ymm0
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm9 = ymm9[2,2,3,3]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
-; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm14 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm9, %ymm0, %ymm0
+; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm13 = [255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0]
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm9, %ymm0, %ymm0
; AVX2-FP-NEXT: vpshufb %ymm8, %ymm2, %ymm8
; AVX2-FP-NEXT: vpshufb %ymm10, %ymm1, %ymm9
; AVX2-FP-NEXT: vpor %ymm8, %ymm9, %ymm8
; AVX2-FP-NEXT: vpshufb %ymm15, %ymm7, %ymm9
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm6
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm14, %ymm6
; AVX2-FP-NEXT: vpor %ymm6, %ymm9, %ymm6
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,2,3,3]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
-; AVX2-FP-NEXT: vpblendvb %ymm14, %ymm8, %ymm6, %ymm6
+; AVX2-FP-NEXT: vpblendvb %ymm13, %ymm8, %ymm6, %ymm6
; AVX2-FP-NEXT: vpshufd {{.*#+}} ymm8 = ymm5[0,2,1,1,4,6,5,5]
; AVX2-FP-NEXT: vpermq {{.*#+}} ymm8 = ymm8[2,3,3,2]
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm9 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
@@ -4473,7 +4473,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FP-NEXT: vmovdqa {{.*#+}} ymm8 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm4, %ymm2, %ymm2
; AVX2-FP-NEXT: vpshufb %ymm1, %ymm7, %ymm1
-; AVX2-FP-NEXT: vpshufb %ymm6, %ymm13, %ymm4
+; AVX2-FP-NEXT: vpshufb %ymm6, %ymm14, %ymm4
; AVX2-FP-NEXT: vpor %ymm1, %ymm4, %ymm1
; AVX2-FP-NEXT: vpblendvb %ymm8, %ymm0, %ymm1, %ymm1
; AVX2-FP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,3,0,4,4,4]
@@ -4539,7 +4539,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: subq $168, %rsp
; AVX2-FCP-NEXT: vmovdqa 32(%rdx), %ymm14
; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %ymm9
-; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm11
+; AVX2-FCP-NEXT: vmovdqa (%r8), %ymm12
; AVX2-FCP-NEXT: vmovdqa (%rcx), %xmm1
; AVX2-FCP-NEXT: vmovdqa %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
; AVX2-FCP-NEXT: vmovdqa 32(%rcx), %xmm6
@@ -4580,7 +4580,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm0 = ymm0[0,0,1,1]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,0,1,1]
; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm0, %ymm2, %ymm0
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm2
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255]
; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqu %ymm1, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
@@ -4608,8 +4608,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm7 = [255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u,0,0,255,255,u]
; AVX2-FCP-NEXT: vpblendvb %ymm7, %ymm1, %ymm4, %ymm4
-; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm12
-; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm12, %ymm0
+; AVX2-FCP-NEXT: vmovdqa (%rdx), %ymm11
+; AVX2-FCP-NEXT: vpshufb %ymm0, %ymm11, %ymm0
; AVX2-FCP-NEXT: vmovdqa (%rcx), %ymm5
; AVX2-FCP-NEXT: vpshufb %ymm10, %ymm5, %ymm1
; AVX2-FCP-NEXT: vpor %ymm0, %ymm1, %ymm10
@@ -4626,7 +4626,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0]
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm9, %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm8, %ymm4
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm8, %ymm4
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm4, %ymm4
; AVX2-FCP-NEXT: vmovdqu %ymm4, {{[-0-9]+}}(%r{{[sb]}}p) # 32-byte Spill
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,19,20,128,22,128,24,128,22,23,128,25,128,23]
@@ -4649,7 +4649,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpshufb %ymm8, %ymm0, %ymm8
; AVX2-FCP-NEXT: vpor %ymm6, %ymm8, %ymm6
; AVX2-FCP-NEXT: vpshufb %ymm9, %ymm5, %ymm8
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm4
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm4
; AVX2-FCP-NEXT: vpor %ymm4, %ymm8, %ymm4
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm6 = ymm6[2,2,3,3]
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[2,2,3,3]
@@ -4658,7 +4658,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm6, %ymm8
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm10 = [255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255]
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm7, %ymm8, %ymm9
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm6, %ymm6
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm6, %ymm6
; AVX2-FCP-NEXT: vpblendvb %ymm10, %ymm4, %ymm6, %ymm7
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [3,3,3,0,4,4,4,4]
; AVX2-FCP-NEXT: vpermd %ymm2, %ymm4, %ymm2
@@ -4678,14 +4678,14 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255,255,0,0,u,255]
; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpshufb %ymm1, %ymm5, %ymm1
-; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm12, %ymm3
+; AVX2-FCP-NEXT: vpshufb %ymm4, %ymm11, %ymm3
; AVX2-FCP-NEXT: vpor %ymm1, %ymm3, %ymm1
; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm0, %ymm1, %ymm1
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm3 = [3,3,3,3,0,4,4,4]
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm3, %ymm0
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm4 = [0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm2, %ymm0, %ymm0
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm3, %ymm2
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm3, %ymm2
; AVX2-FCP-NEXT: vpblendvb %ymm4, %ymm1, %ymm2, %ymm1
; AVX2-FCP-NEXT: vmovdqa {{[-0-9]+}}(%r{{[sb]}}p), %xmm2 # 16-byte Reload
; AVX2-FCP-NEXT: vpunpcklbw {{[-0-9]+}}(%r{{[sb]}}p), %xmm2, %xmm2 # 16-byte Folded Reload
@@ -4713,7 +4713,7 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX2-FCP-NEXT: vpermq {{.*#+}} ymm4 = ymm4[0,0,1,1]
; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm3, %ymm4, %ymm3
; AVX2-FCP-NEXT: vpmovsxbd {{.*#+}} ymm4 = [0,0,0,0,0,0,1,1]
-; AVX2-FCP-NEXT: vpermd %ymm11, %ymm4, %ymm5
+; AVX2-FCP-NEXT: vpermd %ymm12, %ymm4, %ymm5
; AVX2-FCP-NEXT: vmovdqa {{.*#+}} ymm6 = [255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255]
; AVX2-FCP-NEXT: vpblendvb %ymm6, %ymm2, %ymm5, %ymm2
; AVX2-FCP-NEXT: vpermd %ymm13, %ymm4, %ymm4
@@ -4840,12 +4840,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpshufb %xmm13, %xmm3, %xmm13
; AVX512-NEXT: vpor %xmm2, %xmm13, %xmm13
; AVX512-NEXT: vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm2
-; AVX512-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
-; AVX512-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
+; AVX512-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,2]
+; AVX512-NEXT: vpshufb %ymm14, %ymm11, %ymm15
+; AVX512-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1]
; AVX512-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512-NEXT: vpandnq %ymm15, %ymm28, %ymm15
-; AVX512-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512-NEXT: vpandnq %ymm2, %ymm28, %ymm2
+; AVX512-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2
; AVX512-NEXT: vmovdqa (%r8), %ymm15
; AVX512-NEXT: vpshufb %ymm14, %ymm15, %ymm14
; AVX512-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
@@ -4891,12 +4891,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512-NEXT: vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
; AVX512-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
+; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
; AVX512-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
; AVX512-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
; AVX512-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1
; AVX512-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0))
-; AVX512-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
; AVX512-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1))
; AVX512-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5]
; AVX512-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
@@ -5183,12 +5183,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpshufb %xmm13, %xmm3, %xmm13
; AVX512DQ-NEXT: vpor %xmm2, %xmm13, %xmm13
; AVX512DQ-NEXT: vmovdqa {{.*#+}} ymm14 = [12,128,128,128,128,13,128,128,128,128,14,128,128,128,128,15,128,128,128,128,16,128,128,128,128,17,128,128,128,128,18,128]
-; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm2
-; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm15 = mem[1,1,2,2]
-; AVX512DQ-NEXT: vpermq {{.*#+}} ymm15 = ymm15[0,1,1,1]
+; AVX512DQ-NEXT: vpshufd {{.*#+}} xmm2 = mem[1,1,2,2]
+; AVX512DQ-NEXT: vpshufb %ymm14, %ymm11, %ymm15
+; AVX512DQ-NEXT: vpermq {{.*#+}} ymm2 = ymm2[0,1,1,1]
; AVX512DQ-NEXT: vmovdqa64 {{.*#+}} zmm28 = [255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255,255,255,255,0,255]
-; AVX512DQ-NEXT: vpandnq %ymm15, %ymm28, %ymm15
-; AVX512DQ-NEXT: vinserti64x4 $1, %ymm2, %zmm15, %zmm2
+; AVX512DQ-NEXT: vpandnq %ymm2, %ymm28, %ymm2
+; AVX512DQ-NEXT: vinserti64x4 $1, %ymm15, %zmm2, %zmm2
; AVX512DQ-NEXT: vmovdqa (%r8), %ymm15
; AVX512DQ-NEXT: vpshufb %ymm14, %ymm15, %ymm14
; AVX512DQ-NEXT: vpshufd {{.*#+}} ymm15 = ymm15[0,2,1,1,4,6,5,5]
@@ -5234,12 +5234,12 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm7 = ymm19[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm7, %zmm18, %zmm7
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm7 = zmm5 ^ (zmm8 & (zmm7 ^ zmm5))
+; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm0 = ymm0[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm6, %zmm0, %zmm0
; AVX512DQ-NEXT: vpermq {{.*#+}} ymm1 = ymm1[2,2,3,3]
; AVX512DQ-NEXT: vinserti64x4 $1, %ymm10, %zmm1, %zmm1
; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm1 = zmm0 ^ (zmm9 & (zmm1 ^ zmm0))
-; AVX512DQ-NEXT: vpternlogq {{.*#+}} zmm14 = zmm14 | (zmm7 & mem)
; AVX512DQ-NEXT: vpternlogd {{.*#+}} zmm31 = zmm31 ^ (mem & (zmm31 ^ zmm1))
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm0 = zmm4[0,0,1,1,4,4,5,5]
; AVX512DQ-NEXT: vpermq {{.*#+}} zmm1 = zmm3[0,0,1,1,4,4,5,5]
@@ -5652,11 +5652,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512BW-FCP-NEXT: vpshufb %zmm19, %zmm24, %zmm24
; AVX512BW-FCP-NEXT: vporq %zmm23, %zmm24, %zmm23
-; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
; AVX512BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
; AVX512BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
; AVX512BW-FCP-NEXT: vpshufb %zmm25, %zmm8, %zmm8
; AVX512BW-FCP-NEXT: vporq %zmm21, %zmm8, %zmm8
@@ -5727,8 +5727,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
; AVX512BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
+; AVX512BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero
; AVX512BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
@@ -5976,11 +5976,11 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm19 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,21,128,128,20,128,22,128,24,128,128,23,128,25,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,26,128,28,128,128,27,128,29,128,31,128,128,30,128]
; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm19, %zmm24, %zmm24
; AVX512DQ-BW-FCP-NEXT: vporq %zmm23, %zmm24, %zmm23
-; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm21 = zmm21[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpermq {{.*#+}} zmm23 = zmm23[2,2,3,3,6,6,7,7]
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm24 = [128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,18,19,128,21,128,21,20,128,22,128,24,128,22,23,128,25,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,25,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
; AVX512DQ-BW-FCP-NEXT: vshufi64x2 {{.*#+}} zmm8 = zmm8[0,1,2,3],mem[4,5,6,7]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm24, %zmm21, %zmm21
; AVX512DQ-BW-FCP-NEXT: vmovdqa64 {{.*#+}} zmm25 = [u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,128,128,19,128,21,128,128,20,128,22,128,24,128,128,23,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,128,27,128,128,26,128,28,128,30,128,128,29,128,31,128,128]
; AVX512DQ-BW-FCP-NEXT: vpshufb %zmm25, %zmm8, %zmm8
; AVX512DQ-BW-FCP-NEXT: vporq %zmm21, %zmm8, %zmm8
@@ -6051,8 +6051,8 @@ define void @store_i8_stride5_vf64(ptr %in.vecptr0, ptr %in.vecptr1, ptr %in.vec
; AVX512DQ-BW-FCP-NEXT: vpmovsxbq {{.*#+}} zmm6 = [2,2,3,3,8,8,9,9]
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm4
; AVX512DQ-BW-FCP-NEXT: vpunpcklbw {{.*#+}} xmm2 = xmm3[0],xmm5[0],xmm3[1],xmm5[1],xmm3[2],xmm5[2],xmm3[3],xmm5[3],xmm3[4],xmm5[4],xmm3[5],xmm5[5],xmm3[6],xmm5[6],xmm3[7],xmm5[7]
-; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm3 = ymm7[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,27],zero,zero,ymm7[26],zero,ymm7[28],zero,zero,zero,zero,ymm7[29],zero,ymm7[31],zero,zero,ymm7[30]
+; AVX512DQ-BW-FCP-NEXT: vpshufb %xmm15, %xmm2, %xmm2
; AVX512DQ-BW-FCP-NEXT: vpshufb {{.*#+}} ymm5 = ymm18[u,u,u,u,u,u,u,u,u,u,u,u,u,u,u,u],zero,zero,ymm18[26],zero,ymm18[28],zero,zero,ymm18[27],zero,ymm18[29],zero,ymm18[31],zero,zero,ymm18[30],zero
; AVX512DQ-BW-FCP-NEXT: vpor %ymm3, %ymm5, %ymm3
; AVX512DQ-BW-FCP-NEXT: vpermt2q %zmm2, %zmm6, %zmm3
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9cd0f4d12e15ab..346a91afedcfc4 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1,11 +1,11 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse2 | FileCheck %s --check-prefixes=SSE,SSE2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+sse4.1 | FileCheck %s --check-prefixes=SSE,SSE41
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f | FileCheck %s --check-prefixes=AVX,AVX512,AVX512F
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512bw | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BW
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -mattr=+avx512f,+avx512bw,+avx512vl | FileCheck %s --check-prefixes=AVX,AVX512,AVX512BWVL
;
; vXi64
diff --git a/llvm/test/CodeGen/X86/win-smallparams.ll b/llvm/test/CodeGen/X86/win-smallparams.ll
index 5ca8f6705479fd..05e5d1f3ea30d9 100644
--- a/llvm/test/CodeGen/X86/win-smallparams.ll
+++ b/llvm/test/CodeGen/X86/win-smallparams.ll
@@ -2,10 +2,10 @@
; When we accept small parameters on Windows, make sure we do not assume they
; are zero or sign extended in memory or in registers.
-; RUN: llc < %s -mtriple=x86_64-windows-msvc | FileCheck %s --check-prefix=WIN64
-; RUN: llc < %s -mtriple=x86_64-windows-gnu | FileCheck %s --check-prefix=WIN64
-; RUN: llc < %s -mtriple=i686-windows-msvc | FileCheck %s --check-prefix=WIN32-MSVC
-; RUN: llc < %s -mtriple=i686-windows-gnu | FileCheck %s --check-prefix=WIN32-GNU
+; RUN: llc < %s -mtriple=x86_64-windows-msvc -mcpu=generic | FileCheck %s --check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-windows-gnu -mcpu=generic | FileCheck %s --check-prefix=WIN64
+; RUN: llc < %s -mtriple=i686-windows-msvc -mcpu=generic | FileCheck %s --check-prefix=WIN32-MSVC
+; RUN: llc < %s -mtriple=i686-windows-gnu -mcpu=generic | FileCheck %s --check-prefix=WIN32-GNU
define void @call() {
; WIN64-LABEL: call:
diff --git a/llvm/test/CodeGen/X86/x86-interleaved-access.ll b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
index 49947eddc61b9d..43c81769dc5668 100644
--- a/llvm/test/CodeGen/X86/x86-interleaved-access.ll
+++ b/llvm/test/CodeGen/X86/x86-interleaved-access.ll
@@ -1,7 +1,7 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --no_x86_scrub_mem_shuffle
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512,AVX2
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=x86-64 -mattr=+avx | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX1
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=x86-64 -mattr=+avx2 | FileCheck %s --check-prefixes=AVX,AVX1OR2,AVX2OR512,AVX2
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mcpu=x86-64 -mattr=+avx512f -mattr=+avx512bw | FileCheck %s --check-prefixes=AVX,AVX2OR512,AVX512
define <4 x double> @load_factorf64_4(ptr %ptr) nounwind {
; AVX1-LABEL: load_factorf64_4:
@@ -723,25 +723,25 @@ define <32 x i1> @interleaved_load_vf32_i8_stride4(ptr %ptr) nounwind {
; AVX512-LABEL: interleaved_load_vf32_i8_stride4:
; AVX512: # %bb.0:
; AVX512-NEXT: vpmovsxbd {{.*#+}} ymm0 = [0,4,8,12,1,5,9,13]
-; AVX512-NEXT: vmovdqa64 (%rdi), %zmm1
-; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm2
-; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zero,zero,zero,zero,zmm2[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[48,52,56,60,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm1[0,4,8,12],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vporq %zmm3, %zmm4, %zmm3
-; AVX512-NEXT: vpermd %zmm3, %zmm0, %zmm3
-; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm2[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[49,53,57,61,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm1[1,5,9,13],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 64(%rdi), %zmm1
+; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm1[0,4,8,12,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[16,20,24,28,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[32,36,40,44,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[48,52,56,60,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vmovdqa64 (%rdi), %zmm3
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zmm3[0,4,8,12],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,16,20,24,28],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,32,36,40,44],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,48,52,56,60],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512-NEXT: vpshufb {{.*#+}} zmm4 = zero,zero,zero,zero,zmm1[1,5,9,13,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[17,21,25,29,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[33,37,41,45,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[49,53,57,61,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zmm3[1,5,9,13],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,17,21,25,29],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,33,37,41,45],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,49,53,57,61],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpermd %zmm2, %zmm0, %zmm2
; AVX512-NEXT: vporq %zmm4, %zmm5, %zmm4
+; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm1[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[50,54,58,62,u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpermd %zmm4, %zmm0, %zmm4
-; AVX512-NEXT: vpshufb {{.*#+}} zmm5 = zero,zero,zero,zero,zmm2[2,6,10,14,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[18,22,26,30,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[34,38,42,46,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[50,54,58,62,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm1[2,6,10,14],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm6 = zmm3[2,6,10,14],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,18,22,26,30],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,34,38,42,46],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,50,54,58,62],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vporq %zmm5, %zmm6, %zmm5
+; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zero,zero,zero,zero,zmm1[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm1[51,55,59,63,u,u,u,u,u,u,u,u]
+; AVX512-NEXT: vpshufb {{.*#+}} zmm3 = zmm3[3,7,11,15],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm3[u,u,u,u,u,u,u,u]
; AVX512-NEXT: vpermd %zmm5, %zmm0, %zmm5
-; AVX512-NEXT: vpshufb {{.*#+}} zmm2 = zero,zero,zero,zero,zmm2[3,7,11,15,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[19,23,27,31,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[35,39,43,47,u,u,u,u,u,u,u,u],zero,zero,zero,zero,zmm2[51,55,59,63,u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vpshufb {{.*#+}} zmm1 = zmm1[3,7,11,15],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,19,23,27,31],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,35,39,43,47],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u,51,55,59,63],zero,zero,zero,zero,zmm1[u,u,u,u,u,u,u,u]
-; AVX512-NEXT: vporq %zmm2, %zmm1, %zmm1
+; AVX512-NEXT: vporq %zmm1, %zmm3, %zmm1
; AVX512-NEXT: vpermd %zmm1, %zmm0, %zmm0
-; AVX512-NEXT: vpcmpeqb %zmm4, %zmm3, %k0
+; AVX512-NEXT: vpcmpeqb %zmm4, %zmm2, %k0
; AVX512-NEXT: vpcmpeqb %zmm0, %zmm5, %k1
; AVX512-NEXT: kxnord %k1, %k0, %k0
; AVX512-NEXT: vpmovm2b %k0, %zmm0
@@ -843,10 +843,10 @@ define <32 x i8> @interleaved_load_vf32_i8_stride3(ptr %ptr){
; AVX2OR512-NEXT: vbroadcasti128 {{.*#+}} ymm4 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
; AVX2OR512-NEXT: # ymm4 = mem[0,1,0,1]
; AVX2OR512-NEXT: vpblendvb %ymm4, %ymm0, %ymm1, %ymm1
-; AVX2OR512-NEXT: vpaddb %ymm1, %ymm2, %ymm1
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[11,12,13,14,15],ymm3[0,1,2,3,4,5,6,7,8,9,10],ymm0[27,28,29,30,31],ymm3[16,17,18,19,20,21,22,23,24,25,26]
; AVX2OR512-NEXT: vpalignr {{.*#+}} ymm0 = ymm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,26,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25]
-; AVX2OR512-NEXT: vpaddb %ymm0, %ymm1, %ymm0
+; AVX2OR512-NEXT: vpaddb %ymm1, %ymm0, %ymm0
+; AVX2OR512-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX2OR512-NEXT: retq
%wide.vec = load <96 x i8>, ptr %ptr
%v1 = shufflevector <96 x i8> %wide.vec, <96 x i8> undef,<32 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42,i32 45,i32 48,i32 51,i32 54,i32 57,i32 60,i32 63,i32 66,i32 69,i32 72,i32 75,i32 78,i32 81,i32 84,i32 87,i32 90,i32 93>
@@ -873,10 +873,10 @@ define <16 x i8> @interleaved_load_vf16_i8_stride3(ptr %ptr){
; AVX-NEXT: vpalignr {{.*#+}} xmm2 = xmm3[11,12,13,14,15],xmm1[0,1,2,3,4,5,6,7,8,9,10]
; AVX-NEXT: vpmovsxdq {{.*#+}} xmm4 = [18446744073709551615,16777215]
; AVX-NEXT: vpblendvb %xmm4, %xmm0, %xmm1, %xmm1
-; AVX-NEXT: vpaddb %xmm1, %xmm2, %xmm1
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[11,12,13,14,15],xmm3[0,1,2,3,4,5,6,7,8,9,10]
; AVX-NEXT: vpalignr {{.*#+}} xmm0 = xmm0[10,11,12,13,14,15,0,1,2,3,4,5,6,7,8,9]
-; AVX-NEXT: vpaddb %xmm0, %xmm1, %xmm0
+; AVX-NEXT: vpaddb %xmm1, %xmm0, %xmm0
+; AVX-NEXT: vpaddb %xmm0, %xmm2, %xmm0
; AVX-NEXT: retq
%wide.vec = load <48 x i8>, ptr %ptr
%v1 = shufflevector <48 x i8> %wide.vec, <48 x i8> undef,<16 x i32> <i32 0,i32 3,i32 6,i32 9,i32 12,i32 15,i32 18,i32 21,i32 24,i32 27,i32 30,i32 33,i32 36,i32 39,i32 42 ,i32 45>
@@ -894,13 +894,13 @@ define <8 x i8> @interleaved_load_vf8_i8_stride3(ptr %ptr){
; AVX-NEXT: vmovdqa 16(%rdi), %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,zero,zero,zero,zero,zero,xmm1[2,5,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[0,3,6,9,12,15],zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpshufb {{.*#+}} xmm4 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpor %xmm2, %xmm3, %xmm2
-; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,zero,zero,zero,xmm1[0,3,6,u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpshufb {{.*#+}} xmm4 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
-; AVX-NEXT: vpor %xmm3, %xmm4, %xmm3
-; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
+; AVX-NEXT: vpshufb {{.*#+}} xmm3 = xmm0[1,4,7,10,13],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm4, %xmm3, %xmm3
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = zero,zero,zero,zero,zero,xmm1[1,4,7,u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[2,5,8,11,14],zero,zero,zero,xmm0[u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpaddb %xmm3, %xmm2, %xmm2
; AVX-NEXT: vpor %xmm1, %xmm0, %xmm0
; AVX-NEXT: vpaddb %xmm2, %xmm0, %xmm0
; AVX-NEXT: retq
@@ -919,9 +919,9 @@ define void @interleaved_store_vf8_i8_stride3(<8 x i8> %a, <8 x i8> %b, <8 x i8>
; AVX-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX-NEXT: vpshufb {{.*#+}} xmm1 = xmm0[0,8],zero,xmm0[1,9],zero,xmm0[2,10],zero,xmm0[3,11],zero,xmm0[4,12],zero,xmm0[5]
; AVX-NEXT: vpshufb {{.*#+}} xmm3 = zero,zero,xmm2[0],zero,zero,xmm2[1],zero,zero,xmm2[2],zero,zero,xmm2[3],zero,zero,xmm2[4],zero
-; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[13],zero,xmm0[6,14],zero,xmm0[7,15],zero,xmm0[u,u,u,u,u,u,u,u]
; AVX-NEXT: vpshufb {{.*#+}} xmm2 = zero,xmm2[5],zero,zero,xmm2[6],zero,zero,xmm2[7,u,u,u,u,u,u,u,u]
+; AVX-NEXT: vpor %xmm3, %xmm1, %xmm1
; AVX-NEXT: vpor %xmm2, %xmm0, %xmm0
; AVX-NEXT: vmovq %xmm0, 16(%rdi)
; AVX-NEXT: vmovdqu %xmm1, (%rdi)
@@ -1182,13 +1182,13 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX2-LABEL: interleaved_store_vf64_i8_stride3:
; AVX2: # %bb.0:
; AVX2-NEXT: vpalignr {{.*#+}} ymm6 = ymm0[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
-; AVX2-NEXT: vpslldq {{.*#+}} ymm7 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20]
-; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm8 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
-; AVX2-NEXT: # ymm8 = mem[0,1,0,1]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm6, %ymm7, %ymm7
+; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0]
+; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
+; AVX2-NEXT: vpslldq {{.*#+}} ymm8 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20]
+; AVX2-NEXT: vpblendvb %ymm7, %ymm6, %ymm8, %ymm8
; AVX2-NEXT: vpalignr {{.*#+}} ymm9 = ymm1[11,12,13,14,15,0,1,2,3,4,5,6,7,8,9,10,27,28,29,30,31,16,17,18,19,20,21,22,23,24,25,26]
; AVX2-NEXT: vpslldq {{.*#+}} ymm10 = zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4],zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm9, %ymm10, %ymm10
+; AVX2-NEXT: vpblendvb %ymm7, %ymm9, %ymm10, %ymm10
; AVX2-NEXT: vpslldq {{.*#+}} ymm0 = zero,zero,zero,zero,zero,ymm0[0,1,2,3,4,5,6,7,8,9,10],zero,zero,zero,zero,zero,ymm0[16,17,18,19,20,21,22,23,24,25,26]
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm11 = [0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0,0,0,0,0,0,255,255,255,255,255,255,0,0,0,0,0]
; AVX2-NEXT: # ymm11 = mem[0,1,0,1]
@@ -1200,13 +1200,13 @@ define void @interleaved_store_vf64_i8_stride3(<64 x i8> %a, <64 x i8> %b, <64 x
; AVX2-NEXT: vpsrldq {{.*#+}} ymm13 = ymm5[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm5[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
; AVX2-NEXT: vpblendvb %ymm11, %ymm13, %ymm3, %ymm11
; AVX2-NEXT: vpalignr {{.*#+}} ymm3 = ymm10[5,6,7,8,9,10,11,12,13,14,15],ymm3[0,1,2,3,4],ymm10[21,22,23,24,25,26,27,28,29,30,31],ymm3[16,17,18,19,20]
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm7[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm7[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm8[5,6,7,8,9,10,11,12,13,14,15],ymm2[0,1,2,3,4],ymm8[21,22,23,24,25,26,27,28,29,30,31],ymm2[16,17,18,19,20]
; AVX2-NEXT: vpsrldq {{.*#+}} ymm1 = ymm1[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm1[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
; AVX2-NEXT: vpslldq {{.*#+}} ymm5 = zero,zero,zero,zero,zero,zero,ymm5[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm5[16,17,18,19,20,21,22,23,24,25]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm1, %ymm5, %ymm1
+; AVX2-NEXT: vpblendvb %ymm7, %ymm1, %ymm5, %ymm1
; AVX2-NEXT: vpsrldq {{.*#+}} ymm0 = ymm0[5,6,7,8,9,10,11,12,13,14,15],zero,zero,zero,zero,zero,ymm0[21,22,23,24,25,26,27,28,29,30,31],zero,zero,zero,zero,zero
; AVX2-NEXT: vpslldq {{.*#+}} ymm4 = zero,zero,zero,zero,zero,zero,ymm4[0,1,2,3,4,5,6,7,8,9],zero,zero,zero,zero,zero,zero,ymm4[16,17,18,19,20,21,22,23,24,25]
-; AVX2-NEXT: vpblendvb %ymm8, %ymm0, %ymm4, %ymm0
+; AVX2-NEXT: vpblendvb %ymm7, %ymm0, %ymm4, %ymm0
; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm11[5,6,7,8,9,10,11,12,13,14,15],ymm9[0,1,2,3,4],ymm11[21,22,23,24,25,26,27,28,29,30,31],ymm9[16,17,18,19,20]
; AVX2-NEXT: vpalignr {{.*#+}} ymm5 = ymm12[5,6,7,8,9,10,11,12,13,14,15],ymm6[0,1,2,3,4],ymm12[21,22,23,24,25,26,27,28,29,30,31],ymm6[16,17,18,19,20]
; AVX2-NEXT: vinserti128 $1, %xmm0, %ymm2, %ymm6
@@ -1436,10 +1436,10 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX2-NEXT: vbroadcasti128 {{.*#+}} ymm7 = [255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255,255,255,255,255,255,255,0,0,0,0,0,255,255,255,255,255]
; AVX2-NEXT: # ymm7 = mem[0,1,0,1]
; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm0, %ymm0
-; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0
-; AVX2-NEXT: vpalignr {{.*#+}} ymm2 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
+; AVX2-NEXT: vpalignr {{.*#+}} ymm4 = ymm5[5,6,7,8,9,10,11,12,13,14,15,0,1,2,3,4,21,22,23,24,25,26,27,28,29,30,31,16,17,18,19,20]
; AVX2-NEXT: vpshufb %ymm6, %ymm1, %ymm1
-; AVX2-NEXT: vpblendvb %ymm7, %ymm2, %ymm1, %ymm1
+; AVX2-NEXT: vpblendvb %ymm7, %ymm4, %ymm1, %ymm1
+; AVX2-NEXT: vpaddb %ymm0, %ymm2, %ymm0
; AVX2-NEXT: vpaddb %ymm1, %ymm3, %ymm1
; AVX2-NEXT: retq
;
@@ -1453,10 +1453,10 @@ define <64 x i8> @interleaved_load_vf64_i8_stride3(ptr %ptr){
; AVX512-NEXT: vmovdqu 128(%rdi), %xmm5
; AVX512-NEXT: vinserti128 $1, 144(%rdi), %ymm3, %ymm3
; AVX512-NEXT: vinserti128 $1, 48(%rdi), %ymm0, %ymm0
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
-; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm3
+; AVX512-NEXT: vinserti128 $1, 160(%rdi), %ymm4, %ymm4
; AVX512-NEXT: vinserti128 $1, 64(%rdi), %ymm1, %ymm1
-; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm1, %zmm1
+; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm0, %zmm0
+; AVX512-NEXT: vinserti64x4 $1, %ymm4, %zmm1, %zmm1
; AVX512-NEXT: vinserti128 $1, 176(%rdi), %ymm5, %ymm3
; AVX512-NEXT: vinserti128 $1, 80(%rdi), %ymm2, %ymm2
; AVX512-NEXT: vinserti64x4 $1, %ymm3, %zmm2, %zmm2
@@ -1632,8 +1632,8 @@ define void @splat2_v4f64_load_store(ptr %s, ptr %d) nounwind {
; AVX1-LABEL: splat2_v4f64_load_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT: vmovupd %ymm1, (%rsi)
@@ -1669,8 +1669,8 @@ define void @splat2_v4i64_load_store(ptr %s, ptr %d) nounwind {
; AVX1-LABEL: splat2_v4i64_load_store:
; AVX1: # %bb.0:
; AVX1-NEXT: vperm2f128 $51, (%rdi), %ymm0, %ymm0 # ymm0 = mem[2,3,2,3]
-; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vbroadcastf128 (%rdi), %ymm1 # ymm1 = mem[0,1,0,1]
+; AVX1-NEXT: vshufpd {{.*#+}} ymm0 = ymm0[0,0,3,3]
; AVX1-NEXT: vshufpd {{.*#+}} ymm1 = ymm1[0,0,3,3]
; AVX1-NEXT: vmovupd %ymm0, 32(%rsi)
; AVX1-NEXT: vmovupd %ymm1, (%rsi)
diff --git a/llvm/test/CodeGen/X86/xmulo.ll b/llvm/test/CodeGen/X86/xmulo.ll
index 2169b39b9dfa05..8e7c39cc154ca7 100644
--- a/llvm/test/CodeGen/X86/xmulo.ll
+++ b/llvm/test/CodeGen/X86/xmulo.ll
@@ -1,9 +1,9 @@
; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown < %s | FileCheck %s --check-prefixes=CHECK,LINUX,SDAG
-; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefixes=CHECK,LINUX,FAST
+; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -mcpu=x86-64 < %s | FileCheck %s --check-prefixes=CHECK,LINUX,SDAG
+; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -mcpu=x86-64 -fast-isel -fast-isel-abort=1 < %s | FileCheck %s --check-prefixes=CHECK,LINUX,FAST
; RUN: llc -disable-peephole -mtriple=x86_64-linux-unknown -mcpu=knl < %s | FileCheck %s --check-prefixes=CHECK,LINUX,SDAG
-; RUN: llc -disable-peephole -mtriple=x86_64-pc-win32 < %s | FileCheck %s --check-prefixes=CHECK,WIN64
-; RUN: llc -disable-peephole -mtriple=i386-pc-win32 < %s | FileCheck %s --check-prefix=WIN32
+; RUN: llc -disable-peephole -mtriple=x86_64-pc-win32 -mcpu=x86-64 < %s | FileCheck %s --check-prefixes=CHECK,WIN64
+; RUN: llc -disable-peephole -mtriple=i386-pc-win32 -mcpu=x86-64 < %s | FileCheck %s --check-prefix=WIN32
define {i64, i1} @t1() nounwind {
; CHECK-LABEL: t1:
@@ -129,11 +129,11 @@ define zeroext i1 @smuloi16(i16 %v1, i16 %v2, ptr %res) {
;
; WIN32-LABEL: smuloi16:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
-; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %cx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movw %dx, (%ecx)
+; WIN32-NEXT: movw %cx, (%edx)
; WIN32-NEXT: retl
%t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
%val = extractvalue {i16, i1} %t, 0
@@ -168,10 +168,10 @@ define zeroext i1 @smuloi32(i32 %v1, i32 %v2, ptr %res) {
; WIN32-LABEL: smuloi32:
; WIN32: # %bb.0:
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movl %edx, (%ecx)
+; WIN32-NEXT: movl %ecx, (%edx)
; WIN32-NEXT: retl
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%val = extractvalue {i32, i1} %t, 0
@@ -219,9 +219,8 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: movl %edi, %esi
; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: addl %esi, %ecx
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: sarl $31, %eax
@@ -231,6 +230,7 @@ define zeroext i1 @smuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: mull %ebx
; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: addl %ebp, %ecx
; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: addl %ebp, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
@@ -468,23 +468,23 @@ define zeroext i1 @umuloi64(i64 %v1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: testl %esi, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: testl %edi, %edi
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %cl
; WIN32-NEXT: andb %dl, %cl
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: seto %bl
-; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: seto %ch
; WIN32-NEXT: orb %bl, %ch
; WIN32-NEXT: orb %cl, %ch
-; WIN32-NEXT: leal (%edi,%eax), %esi
+; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %esi, %edx
@@ -528,14 +528,11 @@ define i32 @smuloselecti32(i32 %v1, i32 %v2) {
;
; WIN32-LABEL: smuloselecti32:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %eax, %edx
-; WIN32-NEXT: imull %ecx, %edx
-; WIN32-NEXT: jo LBB11_2
-; WIN32-NEXT: # %bb.1:
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: LBB11_2:
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl %ecx, %edx
+; WIN32-NEXT: imull %eax, %edx
+; WIN32-NEXT: cmovol %ecx, %eax
; WIN32-NEXT: retl
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
%obit = extractvalue {i32, i1} %t, 1
@@ -567,66 +564,63 @@ define i64 @smuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: sarl $31, %ecx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: imull %ecx, %edi
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ebp, %esi
+; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %eax, %esi
-; WIN32-NEXT: addl %edi, %esi
-; WIN32-NEXT: movl %ebx, %eax
-; WIN32-NEXT: sarl $31, %eax
; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: addl %esi, %ebx
+; WIN32-NEXT: sarl $31, %ebp
+; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: imull %esi, %ecx
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
; WIN32-NEXT: addl %edi, %ebx
-; WIN32-NEXT: addl %eax, %ebx
-; WIN32-NEXT: addl %ecx, %eax
+; WIN32-NEXT: addl %eax, %ebp
+; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %esi, %ebx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl %edi, %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: adcl %ebx, %ebp
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebx
+; WIN32-NEXT: movl %ebx, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: mull %ecx
+; WIN32-NEXT: movl %edx, %edi
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: addl %esi, %ecx
-; WIN32-NEXT: adcl $0, %ebp
-; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: addl %edi, %ecx
+; WIN32-NEXT: adcl $0, %esi
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: movl %eax, %esi
-; WIN32-NEXT: addl %ecx, %esi
-; WIN32-NEXT: adcl %ebp, %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %ecx, %ebx
+; WIN32-NEXT: adcl %esi, %edi
; WIN32-NEXT: setb %cl
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: addl %edi, %eax
; WIN32-NEXT: movzbl %cl, %ecx
; WIN32-NEXT: adcl %ecx, %edx
; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
-; WIN32-NEXT: adcl %ebx, %edx
-; WIN32-NEXT: sarl $31, %esi
-; WIN32-NEXT: xorl %esi, %edx
-; WIN32-NEXT: xorl %eax, %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: orl %edx, %esi
-; WIN32-NEXT: jne LBB12_2
-; WIN32-NEXT: # %bb.1:
+; WIN32-NEXT: adcl %ebp, %edx
+; WIN32-NEXT: sarl $31, %ebx
+; WIN32-NEXT: xorl %ebx, %edx
+; WIN32-NEXT: xorl %eax, %ebx
+; WIN32-NEXT: orl %edx, %ebx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: LBB12_2:
-; WIN32-NEXT: movl %ebp, %edx
+; WIN32-NEXT: cmovnel {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl %esi, %edx
+; WIN32-NEXT: cmovnel {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: addl $4, %esp
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
@@ -664,11 +658,8 @@ define i32 @umuloselecti32(i32 %v1, i32 %v2) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull %esi
-; WIN32-NEXT: jo LBB13_2
-; WIN32-NEXT: # %bb.1:
-; WIN32-NEXT: movl %esi, %ecx
-; WIN32-NEXT: LBB13_2:
-; WIN32-NEXT: movl %ecx, %eax
+; WIN32-NEXT: cmovol %ecx, %esi
+; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: popl %esi
; WIN32-NEXT: retl
%t = call {i32, i1} @llvm.umul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -701,42 +692,34 @@ define i64 @umuloselecti64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: pushl %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
-; WIN32-NEXT: testl %ebp, %ebp
-; WIN32-NEXT: setne %al
-; WIN32-NEXT: testl %esi, %esi
-; WIN32-NEXT: setne %bl
-; WIN32-NEXT: andb %al, %bl
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %edi
-; WIN32-NEXT: movl %edi, %edx
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
-; WIN32-NEXT: movl %ebp, %eax
-; WIN32-NEXT: movl %edx, %ebp
-; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: seto %bh
-; WIN32-NEXT: orb {{[-0-9]+}}(%e{{[sb]}}p), %bh # 1-byte Folded Reload
-; WIN32-NEXT: orb %bl, %bh
-; WIN32-NEXT: addl %eax, %edi
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: addl %edi, %edx
+; WIN32-NEXT: testl %edi, %edi
+; WIN32-NEXT: setne %dl
+; WIN32-NEXT: testl %eax, %eax
+; WIN32-NEXT: setne %cl
+; WIN32-NEXT: andb %dl, %cl
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: seto %bl
+; WIN32-NEXT: movl %edi, %eax
+; WIN32-NEXT: mull {{[0-9]+}}(%esp)
+; WIN32-NEXT: seto %ch
+; WIN32-NEXT: orb %bl, %ch
+; WIN32-NEXT: leal (%eax,%ebp), %ebx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl %eax, %ebp
+; WIN32-NEXT: mull %esi
+; WIN32-NEXT: orb %cl, %ch
+; WIN32-NEXT: addl %ebx, %edx
; WIN32-NEXT: setb %al
-; WIN32-NEXT: orb %bh, %al
+; WIN32-NEXT: orb %ch, %al
; WIN32-NEXT: testb %al, %al
-; WIN32-NEXT: jne LBB14_2
-; WIN32-NEXT: # %bb.1:
-; WIN32-NEXT: movl %ebp, %ecx
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: LBB14_2:
-; WIN32-NEXT: movl %ecx, %eax
-; WIN32-NEXT: movl %esi, %edx
-; WIN32-NEXT: addl $4, %esp
+; WIN32-NEXT: cmovnel %ebp, %esi
+; WIN32-NEXT: cmovnel {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: movl %edi, %edx
; WIN32-NEXT: popl %esi
; WIN32-NEXT: popl %edi
; WIN32-NEXT: popl %ebx
@@ -993,9 +976,8 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: movl %edi, %esi
; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: addl %esi, %ecx
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: sarl $31, %eax
@@ -1005,6 +987,7 @@ define zeroext i1 @smulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: addl %ebx, %ecx
; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: addl %ebx, %eax
; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
@@ -1305,28 +1288,28 @@ define zeroext i1 @umulobri64(i64 %v1, i64 %v2) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: testl %esi, %esi
-; WIN32-NEXT: setne %dl
-; WIN32-NEXT: testl %eax, %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edi
+; WIN32-NEXT: testl %edi, %edi
; WIN32-NEXT: setne %cl
-; WIN32-NEXT: andb %dl, %cl
+; WIN32-NEXT: testl %eax, %eax
+; WIN32-NEXT: setne %bl
+; WIN32-NEXT: andb %cl, %bl
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: movl %eax, %edi
-; WIN32-NEXT: seto %bl
-; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: movl %eax, %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
+; WIN32-NEXT: seto %bh
+; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: seto %ch
-; WIN32-NEXT: orb %bl, %ch
-; WIN32-NEXT: orb %cl, %ch
-; WIN32-NEXT: leal (%edi,%eax), %esi
+; WIN32-NEXT: seto %cl
+; WIN32-NEXT: orb %bh, %cl
+; WIN32-NEXT: orb %bl, %cl
+; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %esi, %edx
; WIN32-NEXT: setb %al
-; WIN32-NEXT: orb %ch, %al
+; WIN32-NEXT: orb %cl, %al
; WIN32-NEXT: subb $1, %al
; WIN32-NEXT: je LBB22_1
; WIN32-NEXT: # %bb.3: # %continue
@@ -1515,12 +1498,12 @@ define zeroext i1 @smuloi16_load(ptr %ptr1, i16 %v2, ptr %res) {
;
; WIN32-LABEL: smuloi16_load:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movzwl (%eax), %edx
-; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %dx
+; WIN32-NEXT: movzwl (%eax), %ecx
+; WIN32-NEXT: imulw {{[0-9]+}}(%esp), %cx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movw %dx, (%ecx)
+; WIN32-NEXT: movw %cx, (%edx)
; WIN32-NEXT: retl
%v1 = load i16, ptr %ptr1
%t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -1555,12 +1538,12 @@ define zeroext i1 @smuloi16_load2(i16 %v1, ptr %ptr2, ptr %res) {
;
; WIN32-LABEL: smuloi16_load2:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imulw (%eax), %dx
+; WIN32-NEXT: movzwl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: imulw (%eax), %cx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movw %dx, (%ecx)
+; WIN32-NEXT: movw %cx, (%edx)
; WIN32-NEXT: retl
%v2 = load i16, ptr %ptr2
%t = call {i16, i1} @llvm.smul.with.overflow.i16(i16 %v1, i16 %v2)
@@ -1595,12 +1578,12 @@ define zeroext i1 @smuloi32_load(ptr %ptr1, i32 %v2, ptr %res) {
;
; WIN32-LABEL: smuloi32_load:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %edx
-; WIN32-NEXT: imull {{[0-9]+}}(%esp), %edx
+; WIN32-NEXT: movl (%eax), %ecx
+; WIN32-NEXT: imull {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movl %edx, (%ecx)
+; WIN32-NEXT: movl %ecx, (%edx)
; WIN32-NEXT: retl
%v1 = load i32, ptr %ptr1
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -1635,12 +1618,12 @@ define zeroext i1 @smuloi32_load2(i32 %v1, ptr %ptr2, ptr %res) {
;
; WIN32-LABEL: smuloi32_load2:
; WIN32: # %bb.0:
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
+; WIN32-NEXT: imull (%eax), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %edx
-; WIN32-NEXT: imull (%eax), %edx
; WIN32-NEXT: seto %al
-; WIN32-NEXT: movl %edx, (%ecx)
+; WIN32-NEXT: movl %ecx, (%edx)
; WIN32-NEXT: retl
%v2 = load i32, ptr %ptr2
%t = call {i32, i1} @llvm.smul.with.overflow.i32(i32 %v1, i32 %v2)
@@ -1683,65 +1666,67 @@ define zeroext i1 @smuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl (%eax), %ebx
-; WIN32-NEXT: movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl 4(%eax), %ebp
+; WIN32-NEXT: movl %ebx, (%esp) # 4-byte Spill
+; WIN32-NEXT: movl 4(%eax), %edx
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: movl %ecx, %edi
; WIN32-NEXT: sarl $31, %eax
; WIN32-NEXT: movl %eax, %ecx
-; WIN32-NEXT: imull %ebp, %ecx
+; WIN32-NEXT: imull %edx, %ecx
+; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: mull %ebx
-; WIN32-NEXT: movl %eax, (%esp) # 4-byte Spill
-; WIN32-NEXT: movl %edx, %ebx
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: movl %ebp, %ecx
-; WIN32-NEXT: movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: movl %edx, %ebp
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: movl %esi, %ecx
+; WIN32-NEXT: movl %esi, %ebx
+; WIN32-NEXT: movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: movl %edi, %esi
; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ecx
; WIN32-NEXT: movl %edx, %edi
-; WIN32-NEXT: addl %eax, %edi
; WIN32-NEXT: addl %esi, %edi
-; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
-; WIN32-NEXT: addl %ecx, %ebx
-; WIN32-NEXT: addl %eax, %ecx
-; WIN32-NEXT: movl %ecx, (%esp) # 4-byte Spill
-; WIN32-NEXT: adcl %ebx, %edi
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; WIN32-NEXT: addl %ecx, %ebp
+; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; WIN32-NEXT: adcl %ebp, %edi
+; WIN32-NEXT: movl (%esp), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
; WIN32-NEXT: mull %esi
; WIN32-NEXT: movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: movl %ebx, %eax
; WIN32-NEXT: mull %esi
-; WIN32-NEXT: movl %edx, %ebx
+; WIN32-NEXT: movl %edx, %ebp
; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; WIN32-NEXT: adcl $0, %ebx
+; WIN32-NEXT: adcl $0, %ebp
; WIN32-NEXT: movl %ecx, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %ecx
-; WIN32-NEXT: movl %eax, %ebp
-; WIN32-NEXT: addl %esi, %ebp
-; WIN32-NEXT: adcl %ebx, %ecx
-; WIN32-NEXT: setb %bl
+; WIN32-NEXT: movl %eax, %ebx
+; WIN32-NEXT: addl %esi, %ebx
+; WIN32-NEXT: adcl %ebp, %ecx
+; WIN32-NEXT: setb (%esp) # 1-byte Folded Spill
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: addl %ecx, %eax
-; WIN32-NEXT: movzbl %bl, %ecx
+; WIN32-NEXT: movzbl (%esp), %ecx # 1-byte Folded Reload
; WIN32-NEXT: adcl %ecx, %edx
-; WIN32-NEXT: addl (%esp), %eax # 4-byte Folded Reload
+; WIN32-NEXT: addl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
; WIN32-NEXT: adcl %edi, %edx
-; WIN32-NEXT: movl %ebp, %ecx
+; WIN32-NEXT: movl %ebx, %ecx
; WIN32-NEXT: sarl $31, %ecx
; WIN32-NEXT: xorl %ecx, %edx
; WIN32-NEXT: xorl %eax, %ecx
; WIN32-NEXT: orl %edx, %ecx
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl %ebp, 4(%eax)
+; WIN32-NEXT: movl %ebx, 4(%eax)
; WIN32-NEXT: movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
; WIN32-NEXT: movl %ecx, (%eax)
; WIN32-NEXT: setne %al
@@ -1800,9 +1785,8 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: imull %ecx, %esi
; WIN32-NEXT: movl %ebp, %eax
; WIN32-NEXT: mull %ecx
-; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: movl %eax, %ebx
-; WIN32-NEXT: addl %eax, %ecx
+; WIN32-NEXT: movl %edx, %ecx
; WIN32-NEXT: addl %esi, %ecx
; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: sarl $31, %eax
@@ -1811,6 +1795,7 @@ define zeroext i1 @smuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %edx, %esi
; WIN32-NEXT: addl %edi, %esi
+; WIN32-NEXT: addl %ebx, %ecx
; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: addl %ebx, %eax
; WIN32-NEXT: movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -2204,27 +2189,27 @@ define zeroext i1 @umuloi64_load(ptr %ptr1, i64 %v2, ptr %res) {
; WIN32-NEXT: pushl %ebx
; WIN32-NEXT: pushl %edi
; WIN32-NEXT: pushl %esi
+; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ebp
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
-; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
-; WIN32-NEXT: movl (%eax), %ebp
-; WIN32-NEXT: movl 4(%eax), %eax
-; WIN32-NEXT: testl %esi, %esi
+; WIN32-NEXT: movl 4(%esi), %eax
+; WIN32-NEXT: testl %ebp, %ebp
; WIN32-NEXT: setne %dl
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %cl
; WIN32-NEXT: andb %dl, %cl
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: movl (%esi), %esi
; WIN32-NEXT: seto %bl
-; WIN32-NEXT: movl %esi, %eax
-; WIN32-NEXT: mull %ebp
+; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: mull %esi
; WIN32-NEXT: seto %ch
; WIN32-NEXT: orb %bl, %ch
; WIN32-NEXT: orb %cl, %ch
-; WIN32-NEXT: leal (%edi,%eax), %esi
-; WIN32-NEXT: movl %ebp, %eax
+; WIN32-NEXT: addl %eax, %edi
+; WIN32-NEXT: movl %esi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
-; WIN32-NEXT: addl %esi, %edx
+; WIN32-NEXT: addl %edi, %edx
; WIN32-NEXT: setb %cl
; WIN32-NEXT: orb %ch, %cl
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %esi
@@ -2284,21 +2269,21 @@ define zeroext i1 @umuloi64_load2(i64 %v1, ptr %ptr2, ptr %res) {
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: movl (%ecx), %ebp
-; WIN32-NEXT: movl 4(%ecx), %esi
+; WIN32-NEXT: movl 4(%ecx), %edi
; WIN32-NEXT: testl %eax, %eax
; WIN32-NEXT: setne %dl
-; WIN32-NEXT: testl %esi, %esi
+; WIN32-NEXT: testl %edi, %edi
; WIN32-NEXT: setne %cl
; WIN32-NEXT: andb %dl, %cl
; WIN32-NEXT: mull %ebp
-; WIN32-NEXT: movl %eax, %edi
+; WIN32-NEXT: movl %eax, %esi
; WIN32-NEXT: seto %bl
-; WIN32-NEXT: movl %esi, %eax
+; WIN32-NEXT: movl %edi, %eax
; WIN32-NEXT: mull {{[0-9]+}}(%esp)
; WIN32-NEXT: seto %ch
; WIN32-NEXT: orb %bl, %ch
; WIN32-NEXT: orb %cl, %ch
-; WIN32-NEXT: leal (%edi,%eax), %esi
+; WIN32-NEXT: addl %eax, %esi
; WIN32-NEXT: movl {{[0-9]+}}(%esp), %eax
; WIN32-NEXT: mull %ebp
; WIN32-NEXT: addl %esi, %edx