[llvm] [X86] Remove single use assumption in combineVectorSizedSetCCEquality (PR #182200)
Gergo Stomfai via llvm-commits
llvm-commits at lists.llvm.org
Fri Mar 13 03:51:01 PDT 2026
https://github.com/stomfaig updated https://github.com/llvm/llvm-project/pull/182200
From 1abcc7d8bd5f0ee89ee1d05ad7d6975e1b9969a6 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:23:56 +0000
Subject: [PATCH 1/8] add check for all parents of load for foldability
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 36 +++++++++---
llvm/test/CodeGen/X86/bittest-big-integer.ll | 59 +++++++-------------
llvm/test/CodeGen/X86/pr166534.ll | 47 +++++++++-------
3 files changed, 77 insertions(+), 65 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 927a49b203968..aefeb4bf38912 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2959,14 +2959,36 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
case ISD::SUB:
case ISD::FSHL:
case ISD::FSHR:
- return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
- mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget, AssumeSingleUse) &&
+ mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse);
case ISD::SELECT:
- return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
- mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
+ return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse) &&
+ mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget, AssumeSingleUse);
}
}
- return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
+ if (!ISD::isNormalLoad(Op.getNode()))
+ return false;
+
+ // Single-use loads just check the load itself
+ if (AssumeSingleUse || Op.hasOneUse())
+ return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
+ /*IgnoreAlignment=*/true);
+
+ for (SDUse &Use : Op->uses()) {
+ if (Use.getResNo() != 0)
+ continue;
+
+ SDNode *User = Use.getUser();
+ if (ISD::isNormalStore(User))
+ continue;
+
+ if (!mayFoldIntoVector(SDValue(User, 0), DAG, Subtarget,
+ /*AssumeSingleUse=*/true))
+ return false;
+ }
+
+ // All users are vectorizable, now check the load itself
+ return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/false,
/*IgnoreAlignment=*/true);
}
@@ -23540,8 +23562,8 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
// Don't perform this combine if constructing the vector will be expensive.
// TODO: Drop AssumeSingleUse = true override.
- if ((!mayFoldIntoVector(X, DAG, Subtarget, /*AssumeSingleUse=*/true) ||
- !mayFoldIntoVector(Y, DAG, Subtarget, /*AssumeSingleUse=*/true)) &&
+ if ((!mayFoldIntoVector(X, DAG, Subtarget) ||
+ !mayFoldIntoVector(Y, DAG, Subtarget)) &&
!IsOrXorXorTreeCCZero)
return SDValue();
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 7070848e3fe3e..4980bc89ae74a 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1199,45 +1199,26 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: chain_reset_i256:
-; SSE: # %bb.0:
-; SSE-NEXT: # kill: def $ecx killed $ecx def $rcx
-; SSE-NEXT: movl $-2, %eax
-; SSE-NEXT: roll %cl, %eax
-; SSE-NEXT: shrl $3, %ecx
-; SSE-NEXT: andl $28, %ecx
-; SSE-NEXT: andl %eax, (%rdi,%rcx)
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %r8
-; SSE-NEXT: orq 24(%rdi), %r8
-; SSE-NEXT: movq 16(%rdi), %rdi
-; SSE-NEXT: orq %rcx, %rdi
-; SSE-NEXT: movl (%rsi), %eax
-; SSE-NEXT: movl %ecx, (%rsi)
-; SSE-NEXT: movl (%rdx), %ecx
-; SSE-NEXT: addl %ecx, %eax
-; SSE-NEXT: orq %r8, %rdi
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: retq
-;
-; AVX-LABEL: chain_reset_i256:
-; AVX: # %bb.0:
-; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
-; AVX-NEXT: movl $-2, %eax
-; AVX-NEXT: roll %cl, %eax
-; AVX-NEXT: shrl $3, %ecx
-; AVX-NEXT: andl $28, %ecx
-; AVX-NEXT: andl %eax, (%rdi,%rcx)
-; AVX-NEXT: vmovdqu (%rdi), %ymm0
-; AVX-NEXT: movl (%rdi), %ecx
-; AVX-NEXT: movl (%rsi), %eax
-; AVX-NEXT: movl %ecx, (%rsi)
-; AVX-NEXT: movl (%rdx), %ecx
-; AVX-NEXT: addl %ecx, %eax
-; AVX-NEXT: vptest %ymm0, %ymm0
-; AVX-NEXT: cmovnel %ecx, %eax
-; AVX-NEXT: vzeroupper
-; AVX-NEXT: retq
+; X64-LABEL: chain_reset_i256:
+; X64: # %bb.0:
+; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT: movl $-2, %eax
+; X64-NEXT: roll %cl, %eax
+; X64-NEXT: shrl $3, %ecx
+; X64-NEXT: andl $28, %ecx
+; X64-NEXT: andl %eax, (%rdi,%rcx)
+; X64-NEXT: movq (%rdi), %rcx
+; X64-NEXT: movq 8(%rdi), %r8
+; X64-NEXT: orq 24(%rdi), %r8
+; X64-NEXT: movq 16(%rdi), %rdi
+; X64-NEXT: orq %rcx, %rdi
+; X64-NEXT: movl (%rsi), %eax
+; X64-NEXT: movl %ecx, (%rsi)
+; X64-NEXT: movl (%rdx), %ecx
+; X64-NEXT: addl %ecx, %eax
+; X64-NEXT: orq %r8, %rdi
+; X64-NEXT: cmovnel %ecx, %eax
+; X64-NEXT: retq
%rem = and i32 %position, 255
%ofs = zext nneg i32 %rem to i256
%bit = shl nuw i256 1, %ofs
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index 162a0c93bfcf4..fc31fe9ce4bd6 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,15 +7,16 @@
define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
; SSE2-LABEL: pr166534:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movdqu (%rdi), %xmm0
-; SSE2-NEXT: movdqu (%rsi), %xmm1
-; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT: pmovmskb %xmm1, %esi
+; SSE2-NEXT: movq (%rdi), %r8
+; SSE2-NEXT: movq 8(%rdi), %rdi
+; SSE2-NEXT: xorq 8(%rsi), %rdi
+; SSE2-NEXT: xorq (%rsi), %r8
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: movq %r8, %rsi
+; SSE2-NEXT: orq %rdi, %rsi
; SSE2-NEXT: sete %al
; SSE2-NEXT: orq %rax, (%rdx)
-; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT: orq %rdi, %r8
; SSE2-NEXT: jne .LBB0_2
; SSE2-NEXT: # %bb.1: # %if.then
; SSE2-NEXT: orq %rax, (%rcx)
@@ -24,14 +25,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; SSE4-LABEL: pr166534:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movdqu (%rdi), %xmm0
-; SSE4-NEXT: movdqu (%rsi), %xmm1
-; SSE4-NEXT: pxor %xmm0, %xmm1
+; SSE4-NEXT: movq (%rdi), %r8
+; SSE4-NEXT: movq 8(%rdi), %rdi
+; SSE4-NEXT: xorq 8(%rsi), %rdi
+; SSE4-NEXT: xorq (%rsi), %r8
; SSE4-NEXT: xorl %eax, %eax
-; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: movq %r8, %rsi
+; SSE4-NEXT: orq %rdi, %rsi
; SSE4-NEXT: sete %al
; SSE4-NEXT: orq %rax, (%rdx)
-; SSE4-NEXT: ptest %xmm1, %xmm1
+; SSE4-NEXT: orq %rdi, %r8
; SSE4-NEXT: jne .LBB0_2
; SSE4-NEXT: # %bb.1: # %if.then
; SSE4-NEXT: orq %rax, (%rcx)
@@ -40,13 +43,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; AVX2-LABEL: pr166534:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: vmovdqu (%rdi), %xmm0
-; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT: movq (%rdi), %r8
+; AVX2-NEXT: movq 8(%rdi), %rdi
+; AVX2-NEXT: xorq 8(%rsi), %rdi
+; AVX2-NEXT: xorq (%rsi), %r8
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: movq %r8, %rsi
+; AVX2-NEXT: orq %rdi, %rsi
; AVX2-NEXT: sete %al
; AVX2-NEXT: orq %rax, (%rdx)
-; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: orq %rdi, %r8
; AVX2-NEXT: jne .LBB0_2
; AVX2-NEXT: # %bb.1: # %if.then
; AVX2-NEXT: orq %rax, (%rcx)
@@ -55,13 +61,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; AVX512-LABEL: pr166534:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: vmovdqu (%rdi), %xmm0
-; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT: movq (%rdi), %r8
+; AVX512-NEXT: movq 8(%rdi), %rdi
+; AVX512-NEXT: xorq 8(%rsi), %rdi
+; AVX512-NEXT: xorq (%rsi), %r8
; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: movq %r8, %rsi
+; AVX512-NEXT: orq %rdi, %rsi
; AVX512-NEXT: sete %al
; AVX512-NEXT: orq %rax, (%rdx)
-; AVX512-NEXT: vptest %xmm0, %xmm0
+; AVX512-NEXT: orq %rdi, %r8
; AVX512-NEXT: jne .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: orq %rax, (%rcx)
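A minimal standalone sketch of the heuristic this first patch introduces, assuming a deliberately simplified node model (the Node/Opcode types and mayFoldLoadItself below are illustrative stand-ins, not the real SelectionDAG API): a multi-use load is only treated as vector-foldable if every non-store user would itself fold, so the scalar load does not have to stay live next to the rebuilt vector value.

#include <vector>

// Simplified stand-ins for SDNode/ISD opcodes; not the real LLVM types.
enum class Opcode { Load, Store, Or, Xor, Constant, Other };

struct Node {
  Opcode Op;
  std::vector<const Node *> Operands; // values this node consumes
  std::vector<const Node *> Users;    // nodes that consume this node
};

// Stand-in for the X86::mayFoldLoad target query.
static bool mayFoldLoadItself(const Node &N) { return N.Op == Opcode::Load; }

bool mayFoldIntoVector(const Node &N, bool AssumeSingleUse = false) {
  if (N.Op == Opcode::Constant)
    return true;
  // Wide logic ops fold if all of their operands do.
  if (N.Op == Opcode::Or || N.Op == Opcode::Xor) {
    for (const Node *O : N.Operands)
      if (!mayFoldIntoVector(*O, AssumeSingleUse))
        return false;
    return true;
  }
  if (N.Op != Opcode::Load)
    return false;
  // Single-use loads: only the load itself has to be foldable.
  if (AssumeSingleUse || N.Users.size() == 1)
    return mayFoldLoadItself(N);
  // Multi-use loads: stores are harmless, every other user must be
  // vector-foldable in its own right.
  for (const Node *User : N.Users) {
    if (User->Op == Opcode::Store)
      continue;
    if (!mayFoldIntoVector(*User, /*AssumeSingleUse=*/true))
      return false;
  }
  return mayFoldLoadItself(N);
}

int main() {
  // A load feeding both a store and an XOR: both users vectorize,
  // so the multi-use load is accepted.
  Node L{Opcode::Load, {}, {}};
  Node C{Opcode::Constant, {}, {}};
  Node X{Opcode::Xor, {&L, &C}, {}};
  Node S{Opcode::Store, {&L}, {}};
  L.Users = {&X, &S};
  return mayFoldIntoVector(L) ? 0 : 1;
}

Note that the real patch additionally skips non-value results (Use.getResNo() != 0) so the walk ignores the load's chain output, which the model above omits.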
From 3fd4eea3881a34c11eff7b2f3cbd5046055a7011 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:26:12 +0000
Subject: [PATCH 2/8] format
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++----
1 file changed, 8 insertions(+), 4 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aefeb4bf38912..8c23ceabd9894 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2959,11 +2959,15 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
case ISD::SUB:
case ISD::FSHL:
case ISD::FSHR:
- return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget, AssumeSingleUse) &&
- mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse);
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget,
+ AssumeSingleUse) &&
+ mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
+ AssumeSingleUse);
case ISD::SELECT:
- return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse) &&
- mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget, AssumeSingleUse);
+ return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
+ AssumeSingleUse) &&
+ mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget,
+ AssumeSingleUse);
}
}
if (!ISD::isNormalLoad(Op.getNode()))
From be09704213d653a1020bdff6db3cb2e10cee4ed9 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:41:24 +0000
Subject: [PATCH 3/8] remove todo
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
1 file changed, 1 deletion(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c23ceabd9894..dcb42f0576fba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23565,7 +23565,6 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
return SDValue();
// Don't perform this combine if constructing the vector will be expensive.
- // TODO: Drop AssumeSingleUse = true override.
if ((!mayFoldIntoVector(X, DAG, Subtarget) ||
!mayFoldIntoVector(Y, DAG, Subtarget)) &&
!IsOrXorXorTreeCCZero)
From 951da19e959cf464f53595e61b348a0640fc7bc2 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 21:36:23 +0000
Subject: [PATCH 4/8] add backward handling for setcc, truncate and vector
types
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 30 +++++++-
llvm/test/CodeGen/X86/bittest-big-integer.ll | 80 +++++++++++++++-----
llvm/test/CodeGen/X86/pr166534.ll | 47 +++++-------
llvm/test/CodeGen/X86/pr173924.ll | 41 +++++-----
4 files changed, 127 insertions(+), 71 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dcb42f0576fba..71cb5edf385a4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2940,6 +2940,7 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
return true;
if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
return true;
+
EVT VT = Op.getValueType();
unsigned Opcode = Op.getOpcode();
if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
@@ -2970,6 +2971,7 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
AssumeSingleUse);
}
}
+
if (!ISD::isNormalLoad(Op.getNode()))
return false;
@@ -2986,13 +2988,35 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
if (ISD::isNormalStore(User))
continue;
- if (!mayFoldIntoVector(SDValue(User, 0), DAG, Subtarget,
+ if (User->getOpcode() == ISD::SETCC) {
+ ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+ if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+ if (mayFoldIntoVector(User->getOperand(0), DAG, Subtarget,
+ /*AssumeSingleUse=*/true) &&
+ mayFoldIntoVector(User->getOperand(1), DAG, Subtarget,
+ /*AssumeSingleUse=*/true))
+ continue;
+ }
+ return false;
+ }
+
+ if (User->getOpcode() == ISD::TRUNCATE)
+ continue;
+
+ SDValue Value = SDValue(User, 0);
+
+ if (isa<ConstantSDNode>(Value) || isa<ConstantFPSDNode>(Value))
+ continue;
+
+ if (Value.getValueType().isVector())
+ continue;
+
+ if (!mayFoldIntoVector(Value, DAG, Subtarget,
/*AssumeSingleUse=*/true))
return false;
}
- // All users are vectorizable, now check the load itself
- return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/false,
+ return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
/*IgnoreAlignment=*/true);
}
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 4980bc89ae74a..c880f39081baf 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1199,26 +1199,66 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; X64-LABEL: chain_reset_i256:
-; X64: # %bb.0:
-; X64-NEXT: # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT: movl $-2, %eax
-; X64-NEXT: roll %cl, %eax
-; X64-NEXT: shrl $3, %ecx
-; X64-NEXT: andl $28, %ecx
-; X64-NEXT: andl %eax, (%rdi,%rcx)
-; X64-NEXT: movq (%rdi), %rcx
-; X64-NEXT: movq 8(%rdi), %r8
-; X64-NEXT: orq 24(%rdi), %r8
-; X64-NEXT: movq 16(%rdi), %rdi
-; X64-NEXT: orq %rcx, %rdi
-; X64-NEXT: movl (%rsi), %eax
-; X64-NEXT: movl %ecx, (%rsi)
-; X64-NEXT: movl (%rdx), %ecx
-; X64-NEXT: addl %ecx, %eax
-; X64-NEXT: orq %r8, %rdi
-; X64-NEXT: cmovnel %ecx, %eax
-; X64-NEXT: retq
+; SSE2-LABEL: chain_reset_i256:
+; SSE2: # %bb.0:
+; SSE2-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE2-NEXT: movl $-2, %eax
+; SSE2-NEXT: roll %cl, %eax
+; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: andl $28, %ecx
+; SSE2-NEXT: andl %eax, (%rdi,%rcx)
+; SSE2-NEXT: movl (%rdi), %ecx
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: por 16(%rdi), %xmm0
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %edi
+; SSE2-NEXT: xorl $15, %edi
+; SSE2-NEXT: movl (%rsi), %eax
+; SSE2-NEXT: movl %ecx, (%rsi)
+; SSE2-NEXT: movl (%rdx), %ecx
+; SSE2-NEXT: addl %ecx, %eax
+; SSE2-NEXT: testl %edi, %edi
+; SSE2-NEXT: cmovnel %ecx, %eax
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: chain_reset_i256:
+; SSE4: # %bb.0:
+; SSE4-NEXT: # kill: def $ecx killed $ecx def $rcx
+; SSE4-NEXT: movl $-2, %eax
+; SSE4-NEXT: roll %cl, %eax
+; SSE4-NEXT: shrl $3, %ecx
+; SSE4-NEXT: andl $28, %ecx
+; SSE4-NEXT: andl %eax, (%rdi,%rcx)
+; SSE4-NEXT: movl (%rdi), %ecx
+; SSE4-NEXT: movdqa (%rdi), %xmm0
+; SSE4-NEXT: por 16(%rdi), %xmm0
+; SSE4-NEXT: movl (%rsi), %eax
+; SSE4-NEXT: movl %ecx, (%rsi)
+; SSE4-NEXT: movl (%rdx), %ecx
+; SSE4-NEXT: addl %ecx, %eax
+; SSE4-NEXT: ptest %xmm0, %xmm0
+; SSE4-NEXT: cmovnel %ecx, %eax
+; SSE4-NEXT: retq
+;
+; AVX-LABEL: chain_reset_i256:
+; AVX: # %bb.0:
+; AVX-NEXT: # kill: def $ecx killed $ecx def $rcx
+; AVX-NEXT: movl $-2, %eax
+; AVX-NEXT: roll %cl, %eax
+; AVX-NEXT: shrl $3, %ecx
+; AVX-NEXT: andl $28, %ecx
+; AVX-NEXT: andl %eax, (%rdi,%rcx)
+; AVX-NEXT: vmovdqu (%rdi), %ymm0
+; AVX-NEXT: movl (%rdi), %ecx
+; AVX-NEXT: movl (%rsi), %eax
+; AVX-NEXT: movl %ecx, (%rsi)
+; AVX-NEXT: movl (%rdx), %ecx
+; AVX-NEXT: addl %ecx, %eax
+; AVX-NEXT: vptest %ymm0, %ymm0
+; AVX-NEXT: cmovnel %ecx, %eax
+; AVX-NEXT: vzeroupper
+; AVX-NEXT: retq
%rem = and i32 %position, 255
%ofs = zext nneg i32 %rem to i256
%bit = shl nuw i256 1, %ofs
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index fc31fe9ce4bd6..162a0c93bfcf4 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,16 +7,15 @@
define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
; SSE2-LABEL: pr166534:
; SSE2: # %bb.0: # %entry
-; SSE2-NEXT: movq (%rdi), %r8
-; SSE2-NEXT: movq 8(%rdi), %rdi
-; SSE2-NEXT: xorq 8(%rsi), %rdi
-; SSE2-NEXT: xorq (%rsi), %r8
+; SSE2-NEXT: movdqu (%rdi), %xmm0
+; SSE2-NEXT: movdqu (%rsi), %xmm1
+; SSE2-NEXT: pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT: pmovmskb %xmm1, %esi
; SSE2-NEXT: xorl %eax, %eax
-; SSE2-NEXT: movq %r8, %rsi
-; SSE2-NEXT: orq %rdi, %rsi
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
; SSE2-NEXT: sete %al
; SSE2-NEXT: orq %rax, (%rdx)
-; SSE2-NEXT: orq %rdi, %r8
+; SSE2-NEXT: cmpl $65535, %esi # imm = 0xFFFF
; SSE2-NEXT: jne .LBB0_2
; SSE2-NEXT: # %bb.1: # %if.then
; SSE2-NEXT: orq %rax, (%rcx)
@@ -25,16 +24,14 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; SSE4-LABEL: pr166534:
; SSE4: # %bb.0: # %entry
-; SSE4-NEXT: movq (%rdi), %r8
-; SSE4-NEXT: movq 8(%rdi), %rdi
-; SSE4-NEXT: xorq 8(%rsi), %rdi
-; SSE4-NEXT: xorq (%rsi), %r8
+; SSE4-NEXT: movdqu (%rdi), %xmm0
+; SSE4-NEXT: movdqu (%rsi), %xmm1
+; SSE4-NEXT: pxor %xmm0, %xmm1
; SSE4-NEXT: xorl %eax, %eax
-; SSE4-NEXT: movq %r8, %rsi
-; SSE4-NEXT: orq %rdi, %rsi
+; SSE4-NEXT: ptest %xmm1, %xmm1
; SSE4-NEXT: sete %al
; SSE4-NEXT: orq %rax, (%rdx)
-; SSE4-NEXT: orq %rdi, %r8
+; SSE4-NEXT: ptest %xmm1, %xmm1
; SSE4-NEXT: jne .LBB0_2
; SSE4-NEXT: # %bb.1: # %if.then
; SSE4-NEXT: orq %rax, (%rcx)
@@ -43,16 +40,13 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; AVX2-LABEL: pr166534:
; AVX2: # %bb.0: # %entry
-; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %rdi
-; AVX2-NEXT: xorq 8(%rsi), %rdi
-; AVX2-NEXT: xorq (%rsi), %r8
+; AVX2-NEXT: vmovdqu (%rdi), %xmm0
+; AVX2-NEXT: vpxor (%rsi), %xmm0, %xmm0
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: movq %r8, %rsi
-; AVX2-NEXT: orq %rdi, %rsi
+; AVX2-NEXT: vptest %xmm0, %xmm0
; AVX2-NEXT: sete %al
; AVX2-NEXT: orq %rax, (%rdx)
-; AVX2-NEXT: orq %rdi, %r8
+; AVX2-NEXT: vptest %xmm0, %xmm0
; AVX2-NEXT: jne .LBB0_2
; AVX2-NEXT: # %bb.1: # %if.then
; AVX2-NEXT: orq %rax, (%rcx)
@@ -61,16 +55,13 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
;
; AVX512-LABEL: pr166534:
; AVX512: # %bb.0: # %entry
-; AVX512-NEXT: movq (%rdi), %r8
-; AVX512-NEXT: movq 8(%rdi), %rdi
-; AVX512-NEXT: xorq 8(%rsi), %rdi
-; AVX512-NEXT: xorq (%rsi), %r8
+; AVX512-NEXT: vmovdqu (%rdi), %xmm0
+; AVX512-NEXT: vpxor (%rsi), %xmm0, %xmm0
; AVX512-NEXT: xorl %eax, %eax
-; AVX512-NEXT: movq %r8, %rsi
-; AVX512-NEXT: orq %rdi, %rsi
+; AVX512-NEXT: vptest %xmm0, %xmm0
; AVX512-NEXT: sete %al
; AVX512-NEXT: orq %rax, (%rdx)
-; AVX512-NEXT: orq %rdi, %r8
+; AVX512-NEXT: vptest %xmm0, %xmm0
; AVX512-NEXT: jne .LBB0_2
; AVX512-NEXT: # %bb.1: # %if.then
; AVX512-NEXT: orq %rax, (%rcx)
diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll
index d130014a8fa62..17c048c05a7de 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -6,29 +6,30 @@ define i256 @PR173924(<8 x i256> %a0) {
; CHECK-LABEL: PR173924:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT: vmovdqu {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %edx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r8d
-; CHECK-NEXT: movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
; CHECK-NEXT: andl $1, %r10d
-; CHECK-NEXT: andl $1, %esi
-; CHECK-NEXT: addl %r10d, %esi
-; CHECK-NEXT: andl $1, %r8d
-; CHECK-NEXT: andl $1, %ecx
-; CHECK-NEXT: addl %r8d, %ecx
-; CHECK-NEXT: addl %esi, %ecx
+; CHECK-NEXT: andl $1, %r9d
+; CHECK-NEXT: addq %r10, %r9
+; CHECK-NEXT: vmovd {{.*#+}} xmm0 = [1,0,0,0]
+; CHECK-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; CHECK-NEXT: vmovq %xmm1, %r10
; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT: vmovq %xmm0, %rsi
+; CHECK-NEXT: addq %r10, %rdx
+; CHECK-NEXT: addq %r9, %rdx
+; CHECK-NEXT: andl $1, %r8d
+; CHECK-NEXT: andl $1, %esi
+; CHECK-NEXT: addq %r8, %rsi
; CHECK-NEXT: andl $1, %edi
-; CHECK-NEXT: andl $1, %r9d
-; CHECK-NEXT: addl %edi, %r9d
-; CHECK-NEXT: addl %edx, %esi
-; CHECK-NEXT: addl %r9d, %esi
-; CHECK-NEXT: addl %ecx, %esi
-; CHECK-NEXT: vmovd %esi, %xmm0
+; CHECK-NEXT: andl $1, %ecx
+; CHECK-NEXT: addq %rdi, %rcx
+; CHECK-NEXT: addq %rsi, %rcx
+; CHECK-NEXT: addq %rdx, %rcx
+; CHECK-NEXT: vmovq %rcx, %xmm1
+; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
; CHECK-NEXT: vmovdqu %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq
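Patch 4 widens the set of load users the walk tolerates. A compact sketch of the resulting filter, under the same simplified-model assumption (userAllowsVectorFold and the enums below are illustrative names, not the patch's): stores, truncates, and equality-only compares qualify, as do users that already produce vector values (modeled here by ProducesVector); ordered compares would force the value back into scalar registers. The patch also special-cases constant user values, which this sketch leaves out.

#include <cassert>

// Simplified stand-ins for the ISD opcode and condition-code enums.
enum class Opcode { Store, Truncate, SetCC, Add };
enum class CondCode { SetEQ, SetNE, SetLT, None };

struct Use {
  Opcode Op;
  CondCode CC = CondCode::None;
  bool ProducesVector = false;
};

// A load user keeps the fold profitable when consuming the value in
// vector form is free.
bool userAllowsVectorFold(const Use &U) {
  switch (U.Op) {
  case Opcode::Store:
  case Opcode::Truncate:
    return true;
  case Opcode::SetCC:
    // EQ/NE lower to cheap vector tests (ptest, pcmpeq+movmsk).
    return U.CC == CondCode::SetEQ || U.CC == CondCode::SetNE;
  default:
    return U.ProducesVector;
  }
}

int main() {
  assert(userAllowsVectorFold({Opcode::SetCC, CondCode::SetNE}));
  assert(!userAllowsVectorFold({Opcode::SetCC, CondCode::SetLT}));
  assert(!userAllowsVectorFold({Opcode::Add}));
}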
From 97d39e5cbec2299757b84b449678d9a5cdfe450a Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Tue, 10 Mar 2026 10:50:09 +0000
Subject: [PATCH 5/8] remove setcc traversal in load check
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 9 ++-------
1 file changed, 2 insertions(+), 7 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71cb5edf385a4..206b54c519319 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2990,13 +2990,8 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
if (User->getOpcode() == ISD::SETCC) {
ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
- if (CC == ISD::SETEQ || CC == ISD::SETNE) {
- if (mayFoldIntoVector(User->getOperand(0), DAG, Subtarget,
- /*AssumeSingleUse=*/true) &&
- mayFoldIntoVector(User->getOperand(1), DAG, Subtarget,
- /*AssumeSingleUse=*/true))
- continue;
- }
+ if (CC == ISD::SETEQ || CC == ISD::SETNE)
+ continue;
return false;
}
>From 8c9c50d4fa6026abc83e5ba09d2bd057a3186a57 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 12 Mar 2026 09:00:39 +0000
Subject: [PATCH 6/8] remove load user checks
---
llvm/lib/Target/X86/X86ISelLowering.cpp | 55 +----
llvm/test/CodeGen/X86/bittest-big-integer.ll | 235 +++++++++++--------
llvm/test/CodeGen/X86/setcc-wide-types.ll | 167 ++++++++-----
llvm/test/CodeGen/X86/urem-seteq.ll | 3 +-
4 files changed, 251 insertions(+), 209 deletions(-)
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 206b54c519319..8906f16a8dd17 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2934,8 +2934,7 @@ bool X86::mayFoldIntoZeroExtend(SDValue Op) {
// Return true if its cheap to bitcast this to a vector type.
static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
- const X86Subtarget &Subtarget,
- bool AssumeSingleUse = false) {
+ const X86Subtarget &Subtarget) {
if (peekThroughBitcasts(Op).getValueType().isVector())
return true;
if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
@@ -2960,57 +2959,13 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
case ISD::SUB:
case ISD::FSHL:
case ISD::FSHR:
- return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget,
- AssumeSingleUse) &&
- mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
- AssumeSingleUse);
+ return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
+ mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
case ISD::SELECT:
- return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
- AssumeSingleUse) &&
- mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget,
- AssumeSingleUse);
+ return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
+ mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
}
}
-
- if (!ISD::isNormalLoad(Op.getNode()))
- return false;
-
- // Single-use loads just check the load itself
- if (AssumeSingleUse || Op.hasOneUse())
- return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
- /*IgnoreAlignment=*/true);
-
- for (SDUse &Use : Op->uses()) {
- if (Use.getResNo() != 0)
- continue;
-
- SDNode *User = Use.getUser();
- if (ISD::isNormalStore(User))
- continue;
-
- if (User->getOpcode() == ISD::SETCC) {
- ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
- if (CC == ISD::SETEQ || CC == ISD::SETNE)
- continue;
- return false;
- }
-
- if (User->getOpcode() == ISD::TRUNCATE)
- continue;
-
- SDValue Value = SDValue(User, 0);
-
- if (isa<ConstantSDNode>(Value) || isa<ConstantFPSDNode>(Value))
- continue;
-
- if (Value.getValueType().isVector())
- continue;
-
- if (!mayFoldIntoVector(Value, DAG, Subtarget,
- /*AssumeSingleUse=*/true))
- return false;
- }
-
return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
/*IgnoreAlignment=*/true);
}
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index c880f39081baf..96ccc7b0f7527 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1892,114 +1892,157 @@ define i32 @blsr_u512(ptr %word) nounwind {
; X86-NEXT: popl %ebp
; X86-NEXT: retl
;
-; SSE-LABEL: blsr_u512:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 48(%rdi), %r11
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: movq 24(%rdi), %r8
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %rsi
-; SSE-NEXT: rep bsfq %rcx, %rax
-; SSE-NEXT: rep bsfq %rsi, %rbx
-; SSE-NEXT: addq $64, %rbx
-; SSE-NEXT: testq %rcx, %rcx
-; SSE-NEXT: cmovneq %rax, %rbx
-; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: rep bsfq %r8, %r10
-; SSE-NEXT: addq $64, %r10
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovneq %rax, %r10
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: subq $-128, %r10
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: cmovneq %rbx, %r10
-; SSE-NEXT: rep bsfq %r14, %rax
-; SSE-NEXT: rep bsfq %r9, %rbx
-; SSE-NEXT: addq $64, %rbx
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovneq %rax, %rbx
-; SSE-NEXT: rep bsfq %r11, %r15
-; SSE-NEXT: movl $64, %eax
-; SSE-NEXT: rep bsfq 56(%rdi), %rax
-; SSE-NEXT: addq $64, %rax
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovneq %r15, %rax
-; SSE-NEXT: subq $-128, %rax
-; SSE-NEXT: orq %r9, %r14
-; SSE-NEXT: cmovneq %rbx, %rax
-; SSE-NEXT: addq $256, %rax # imm = 0x100
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rdx, %rcx
-; SSE-NEXT: orq %rsi, %rcx
-; SSE-NEXT: cmovneq %r10, %rax
-; SSE-NEXT: movl $-2, %edx
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: roll %cl, %edx
-; SSE-NEXT: movl %eax, %ecx
-; SSE-NEXT: shrl $3, %ecx
-; SSE-NEXT: andl $60, %ecx
-; SSE-NEXT: andl %edx, (%rdi,%rcx)
-; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: retq
+; SSE2-LABEL: blsr_u512:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: movdqa (%rdi), %xmm0
+; SSE2-NEXT: movq 48(%rdi), %r8
+; SSE2-NEXT: movq 40(%rdi), %rdx
+; SSE2-NEXT: movq 32(%rdi), %rsi
+; SSE2-NEXT: movq (%rdi), %rax
+; SSE2-NEXT: movq 8(%rdi), %r9
+; SSE2-NEXT: rep bsfq %rax, %rcx
+; SSE2-NEXT: rep bsfq %r9, %r10
+; SSE2-NEXT: addq $64, %r10
+; SSE2-NEXT: testq %rax, %rax
+; SSE2-NEXT: cmovneq %rcx, %r10
+; SSE2-NEXT: movq 16(%rdi), %r11
+; SSE2-NEXT: rep bsfq %r11, %rbx
+; SSE2-NEXT: rep bsfq 24(%rdi), %rcx
+; SSE2-NEXT: addq $64, %rcx
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: cmovneq %rbx, %rcx
+; SSE2-NEXT: subq $-128, %rcx
+; SSE2-NEXT: orq %r9, %rax
+; SSE2-NEXT: cmovneq %r10, %rcx
+; SSE2-NEXT: rep bsfq %rsi, %rax
+; SSE2-NEXT: rep bsfq %rdx, %r9
+; SSE2-NEXT: addq $64, %r9
+; SSE2-NEXT: testq %rsi, %rsi
+; SSE2-NEXT: cmovneq %rax, %r9
+; SSE2-NEXT: rep bsfq %r8, %r10
+; SSE2-NEXT: movl $64, %eax
+; SSE2-NEXT: rep bsfq 56(%rdi), %rax
+; SSE2-NEXT: addq $64, %rax
+; SSE2-NEXT: testq %r8, %r8
+; SSE2-NEXT: cmovneq %r10, %rax
+; SSE2-NEXT: subq $-128, %rax
+; SSE2-NEXT: orq %rdx, %rsi
+; SSE2-NEXT: cmovneq %r9, %rax
+; SSE2-NEXT: por 16(%rdi), %xmm0
+; SSE2-NEXT: addq $256, %rax # imm = 0x100
+; SSE2-NEXT: pxor %xmm1, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %edx
+; SSE2-NEXT: xorl $15, %edx
+; SSE2-NEXT: cmovneq %rcx, %rax
+; SSE2-NEXT: movl $-2, %edx
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: roll %cl, %edx
+; SSE2-NEXT: movl %eax, %ecx
+; SSE2-NEXT: shrl $3, %ecx
+; SSE2-NEXT: andl $60, %ecx
+; SSE2-NEXT: andl %edx, (%rdi,%rcx)
+; SSE2-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: retq
+;
+; SSE4-LABEL: blsr_u512:
+; SSE4: # %bb.0:
+; SSE4-NEXT: movdqa (%rdi), %xmm0
+; SSE4-NEXT: movq 48(%rdi), %rdx
+; SSE4-NEXT: movq 40(%rdi), %rcx
+; SSE4-NEXT: movq (%rdi), %rax
+; SSE4-NEXT: movq 8(%rdi), %r8
+; SSE4-NEXT: rep bsfq %rax, %rsi
+; SSE4-NEXT: rep bsfq %r8, %r9
+; SSE4-NEXT: addq $64, %r9
+; SSE4-NEXT: testq %rax, %rax
+; SSE4-NEXT: cmovneq %rsi, %r9
+; SSE4-NEXT: movq 16(%rdi), %r10
+; SSE4-NEXT: rep bsfq %r10, %r11
+; SSE4-NEXT: rep bsfq 24(%rdi), %rsi
+; SSE4-NEXT: addq $64, %rsi
+; SSE4-NEXT: testq %r10, %r10
+; SSE4-NEXT: cmovneq %r11, %rsi
+; SSE4-NEXT: subq $-128, %rsi
+; SSE4-NEXT: orq %r8, %rax
+; SSE4-NEXT: cmovneq %r9, %rsi
+; SSE4-NEXT: movq 32(%rdi), %r8
+; SSE4-NEXT: rep bsfq %r8, %rax
+; SSE4-NEXT: rep bsfq %rcx, %r9
+; SSE4-NEXT: addq $64, %r9
+; SSE4-NEXT: testq %r8, %r8
+; SSE4-NEXT: cmovneq %rax, %r9
+; SSE4-NEXT: rep bsfq %rdx, %r10
+; SSE4-NEXT: movl $64, %eax
+; SSE4-NEXT: rep bsfq 56(%rdi), %rax
+; SSE4-NEXT: addq $64, %rax
+; SSE4-NEXT: testq %rdx, %rdx
+; SSE4-NEXT: cmovneq %r10, %rax
+; SSE4-NEXT: subq $-128, %rax
+; SSE4-NEXT: orq %rcx, %r8
+; SSE4-NEXT: cmovneq %r9, %rax
+; SSE4-NEXT: addq $256, %rax # imm = 0x100
+; SSE4-NEXT: por 16(%rdi), %xmm0
+; SSE4-NEXT: ptest %xmm0, %xmm0
+; SSE4-NEXT: cmovneq %rsi, %rax
+; SSE4-NEXT: movl $-2, %edx
+; SSE4-NEXT: movl %eax, %ecx
+; SSE4-NEXT: roll %cl, %edx
+; SSE4-NEXT: movl %eax, %ecx
+; SSE4-NEXT: shrl $3, %ecx
+; SSE4-NEXT: andl $60, %ecx
+; SSE4-NEXT: andl %edx, (%rdi,%rcx)
+; SSE4-NEXT: # kill: def $eax killed $eax killed $rax
+; SSE4-NEXT: retq
;
; AVX2-LABEL: blsr_u512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 40(%rdi), %r9
-; AVX2-NEXT: movq 32(%rdi), %r10
-; AVX2-NEXT: movq 24(%rdi), %r8
-; AVX2-NEXT: movq 16(%rdi), %rdx
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: movq 8(%rdi), %rsi
-; AVX2-NEXT: tzcntq %rcx, %rax
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
+; AVX2-NEXT: movq 48(%rdi), %rsi
+; AVX2-NEXT: movq 40(%rdi), %rcx
+; AVX2-NEXT: movq 32(%rdi), %rdx
+; AVX2-NEXT: movq 16(%rdi), %rax
+; AVX2-NEXT: movq (%rdi), %r9
+; AVX2-NEXT: movq 8(%rdi), %r10
+; AVX2-NEXT: tzcntq %r9, %r8
+; AVX2-NEXT: tzcntq %r10, %r11
+; AVX2-NEXT: addq $64, %r11
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovneq %r8, %r11
; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: tzcntq %rsi, %rbx
-; AVX2-NEXT: addq $64, %rbx
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovneq %rax, %rbx
+; AVX2-NEXT: tzcntq %rax, %rbx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq 24(%rdi), %r8
+; AVX2-NEXT: addq $64, %r8
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovneq %rbx, %r8
+; AVX2-NEXT: subq $-128, %r8
+; AVX2-NEXT: orq %r10, %r9
+; AVX2-NEXT: cmovneq %r11, %r8
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: tzcntq %r8, %r11
-; AVX2-NEXT: addq $64, %r11
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: tzcntq %rcx, %r9
+; AVX2-NEXT: addq $64, %r9
; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovneq %rax, %r11
-; AVX2-NEXT: subq $-128, %r11
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: orq %rsi, %rax
-; AVX2-NEXT: cmovneq %rbx, %r11
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r10, %rax
-; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: tzcntq %r9, %rbx
-; AVX2-NEXT: addq $64, %rbx
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovneq %rax, %rbx
-; AVX2-NEXT: movq 48(%rdi), %r14
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r14, %r15
+; AVX2-NEXT: cmovneq %rax, %r9
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %rsi, %r10
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 56(%rdi), %rax
; AVX2-NEXT: addq $64, %rax
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovneq %r15, %rax
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: cmovneq %r10, %rax
; AVX2-NEXT: subq $-128, %rax
-; AVX2-NEXT: orq %r9, %r10
-; AVX2-NEXT: cmovneq %rbx, %rax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovneq %r9, %rax
; AVX2-NEXT: addq $256, %rax # imm = 0x100
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: cmovneq %r11, %rax
+; AVX2-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: cmovneq %r8, %rax
; AVX2-NEXT: movl $-2, %edx
; AVX2-NEXT: movl %eax, %ecx
; AVX2-NEXT: roll %cl, %edx
@@ -2009,8 +2052,6 @@ define i32 @blsr_u512(ptr %word) nounwind {
; AVX2-NEXT: andl %edx, (%rdi,%rcx)
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512-LABEL: blsr_u512:
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 1e53dc01ed168..59de15b6a43a8 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -376,17 +376,17 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
;
; AVX1-LABEL: ne_v4i256:
; AVX1: # %bb.0:
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT: orq %r10, %rcx
-; AVX1-NEXT: vmovq %rcx, %xmm0
-; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX1-NEXT: vpextrq $1, %xmm0, %rax
+; AVX1-NEXT: orq %rcx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm1
; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT: orq %rax, %rdx
-; AVX1-NEXT: vmovq %rdx, %xmm1
-; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT: vmovq %xmm0, %rax
+; AVX1-NEXT: orq %rdx, %rax
+; AVX1-NEXT: vmovq %rax, %xmm0
+; AVX1-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %r9
; AVX1-NEXT: orq {{[0-9]+}}(%rsp), %rsi
; AVX1-NEXT: orq %r9, %rsi
@@ -404,17 +404,17 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
;
; AVX2-LABEL: ne_v4i256:
; AVX2: # %bb.0:
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: orq %r10, %rcx
-; AVX2-NEXT: vmovq %rcx, %xmm0
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: vpextrq $1, %xmm0, %rax
+; AVX2-NEXT: orq %rcx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm1
; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: orq %rax, %rdx
-; AVX2-NEXT: vmovq %rdx, %xmm1
-; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: orq %rdx, %rax
+; AVX2-NEXT: vmovq %rax, %xmm0
+; AVX2-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9
; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rsi
; AVX2-NEXT: orq %r9, %rsi
@@ -1068,27 +1068,49 @@ define i1 @eq_i256_args(i256 %a, i256 %b) {
}
define i1 @eq_i512_args(i512 %a, i512 %b) {
-; CHECK-LABEL: eq_i512_args:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT: orq %r10, %rcx
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT: orq %r9, %rsi
-; CHECK-NEXT: orq %rcx, %rsi
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT: orq %rax, %rdx
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r8
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: orq %r8, %rdi
-; CHECK-NEXT: orq %rdx, %rdi
-; CHECK-NEXT: orq %rsi, %rdi
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: eq_i512_args:
+; SSE: # %bb.0:
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: orq %r10, %rcx
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: orq %r9, %rsi
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: orq %rax, %rdx
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: orq %rdx, %rdi
+; SSE-NEXT: orq %rsi, %rdi
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: eq_i512_args:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r9
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT: orq %r9, %rsi
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT: vpxor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: orq %rsi, %rax
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdi
+; AVX-NEXT: orq %r8, %rdi
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT: vmovq %xmm0, %rcx
+; AVX-NEXT: orq %rdx, %rcx
+; AVX-NEXT: orq %rdi, %rcx
+; AVX-NEXT: orq %rax, %rcx
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%r = icmp eq i512 %a, %b
ret i1 %r
}
@@ -1225,28 +1247,51 @@ define i1 @eq_i256_load_arg(ptr%p, i256 %b) {
}
define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
-; CHECK-LABEL: eq_i512_load_arg:
-; CHECK: # %bb.0:
-; CHECK-NEXT: movq 40(%rdi), %rax
-; CHECK-NEXT: movq 48(%rdi), %r10
-; CHECK-NEXT: movq 56(%rdi), %r11
-; CHECK-NEXT: xorq 24(%rdi), %r8
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT: orq %r8, %r11
-; CHECK-NEXT: xorq 8(%rdi), %rdx
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT: orq %rdx, %rax
-; CHECK-NEXT: orq %r11, %rax
-; CHECK-NEXT: xorq 32(%rdi), %r9
-; CHECK-NEXT: xorq (%rdi), %rsi
-; CHECK-NEXT: orq %r9, %rsi
-; CHECK-NEXT: xorq 16(%rdi), %rcx
-; CHECK-NEXT: xorq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: orq %rcx, %r10
-; CHECK-NEXT: orq %rsi, %r10
-; CHECK-NEXT: orq %rax, %r10
-; CHECK-NEXT: sete %al
-; CHECK-NEXT: retq
+; SSE-LABEL: eq_i512_load_arg:
+; SSE: # %bb.0:
+; SSE-NEXT: movq 40(%rdi), %rax
+; SSE-NEXT: movq 48(%rdi), %r10
+; SSE-NEXT: movq 56(%rdi), %r11
+; SSE-NEXT: xorq 24(%rdi), %r8
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: orq %r8, %r11
+; SSE-NEXT: xorq 8(%rdi), %rdx
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: orq %rdx, %rax
+; SSE-NEXT: orq %r11, %rax
+; SSE-NEXT: xorq 32(%rdi), %r9
+; SSE-NEXT: xorq (%rdi), %rsi
+; SSE-NEXT: orq %r9, %rsi
+; SSE-NEXT: xorq 16(%rdi), %rcx
+; SSE-NEXT: xorq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: orq %rcx, %r10
+; SSE-NEXT: orq %rsi, %r10
+; SSE-NEXT: orq %rax, %r10
+; SSE-NEXT: sete %al
+; SSE-NEXT: retq
+;
+; AVX-LABEL: eq_i512_load_arg:
+; AVX: # %bb.0:
+; AVX-NEXT: vmovdqa 48(%rdi), %xmm0
+; AVX-NEXT: movq 40(%rdi), %rax
+; AVX-NEXT: xorq 8(%rdi), %rdx
+; AVX-NEXT: xorq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: xorq 24(%rdi), %r8
+; AVX-NEXT: vpxor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX-NEXT: orq %r8, %rdx
+; AVX-NEXT: orq %rax, %rdx
+; AVX-NEXT: xorq 32(%rdi), %r9
+; AVX-NEXT: xorq (%rdi), %rsi
+; AVX-NEXT: orq %r9, %rsi
+; AVX-NEXT: xorq 16(%rdi), %rcx
+; AVX-NEXT: vmovq %xmm0, %rax
+; AVX-NEXT: orq %rcx, %rax
+; AVX-NEXT: orq %rsi, %rax
+; AVX-NEXT: orq %rdx, %rax
+; AVX-NEXT: sete %al
+; AVX-NEXT: retq
%a = load i512, ptr %p
%r = icmp eq i512 %a, %b
ret i1 %r
diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll
index 72e91ce80d1a5..9441ccf2a2843 100644
--- a/llvm/test/CodeGen/X86/urem-seteq.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq.ll
@@ -369,7 +369,8 @@ define void @ossfuzz34366() {
;
; X64-LABEL: ossfuzz34366:
; X64: # %bb.0:
-; X64-NEXT: cmpq $0, (%rax)
+; X64-NEXT: movq (%rax), %rax
+; X64-NEXT: orq %rax, %rax
; X64-NEXT: sete (%rax)
; X64-NEXT: retq
%L10 = load i448, ptr undef, align 4
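Patch 6 removes the user walk entirely. Under the same illustrative model as the earlier sketch (repeated here so this block stands alone), the surviving predicate reduces to the recursion over foldable scalar ops plus unconditional acceptance of any normal load, which is what switching X86::mayFoldLoad to AssumeSingleUse=true amounts to; the urem-seteq.ll hunk above shows the tradeoff, where the fold now fires even for a lone compare of a loaded value.

#include <vector>

// Illustrative node model, not the real SelectionDAG types.
enum class Opcode { Load, Or, Xor, Constant, Other };

struct Node {
  Opcode Op;
  std::vector<const Node *> Operands;
};

// No per-user reasoning remains: recurse through foldable scalar ops
// and accept every normal load.
bool mayFoldIntoVector(const Node &N) {
  switch (N.Op) {
  case Opcode::Constant:
    return true;
  case Opcode::Or:
  case Opcode::Xor:
    for (const Node *O : N.Operands)
      if (!mayFoldIntoVector(*O))
        return false;
    return true;
  case Opcode::Load:
    return true; // stands in for mayFoldLoad(..., /*AssumeSingleUse=*/true)
  default:
    return false;
  }
}

int main() {
  Node L{Opcode::Load, {}};
  Node C{Opcode::Constant, {}};
  Node X{Opcode::Xor, {&L, &C}};
  return mayFoldIntoVector(X) ? 0 : 1;
}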
From 875449af23b9710c35dd1e1ce11c2d498333804d Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 12 Mar 2026 10:19:10 +0000
Subject: [PATCH 7/8] update tests on rebase
---
llvm/test/CodeGen/X86/bit-manip-i256.ll | 263 +-
llvm/test/CodeGen/X86/bit-manip-i512.ll | 1001 ++--
llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 4693 ++++++++-----------
llvm/test/CodeGen/X86/funnel-shift-i256.ll | 278 +-
llvm/test/CodeGen/X86/funnel-shift-i512.ll | 1129 ++---
llvm/test/CodeGen/X86/ucmp.ll | 2306 +++++----
6 files changed, 4303 insertions(+), 5367 deletions(-)
diff --git a/llvm/test/CodeGen/X86/bit-manip-i256.ll b/llvm/test/CodeGen/X86/bit-manip-i256.ll
index 1867f068828a2..dba8d0d3dd07f 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i256.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i256.ll
@@ -2983,67 +2983,124 @@ define i256 @isolate_msb_i256_vector(<4 x i64> %v0, i256 %idx) nounwind {
}
define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
-; SSE-LABEL: isolate_msb_i256_load:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 16(%rsi), %r8
-; SSE-NEXT: movq 24(%rsi), %r9
-; SSE-NEXT: movq (%rsi), %rax
-; SSE-NEXT: movq 8(%rsi), %rsi
-; SSE-NEXT: movq %rsi, %rdx
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: bsrq %rax, %rcx
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: bsrq %r9, %r10
-; SSE-NEXT: xorq $63, %r10
-; SSE-NEXT: bsrq %r8, %r11
-; SSE-NEXT: xorq $63, %r11
-; SSE-NEXT: orq $64, %r11
-; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: cmovneq %r10, %r11
-; SSE-NEXT: bsrq %rsi, %r10
-; SSE-NEXT: xorq $63, %r10
-; SSE-NEXT: xorq $63, %rcx
-; SSE-NEXT: orq $64, %rcx
-; SSE-NEXT: testq %rsi, %rsi
-; SSE-NEXT: cmovneq %r10, %rcx
-; SSE-NEXT: orq $128, %rcx
-; SSE-NEXT: orq %r9, %r8
-; SSE-NEXT: cmovneq %r11, %rcx
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000
-; SSE-NEXT: movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %ecx, %esi
-; SSE-NEXT: shrb $6, %sil
-; SSE-NEXT: movzbl %sil, %r8d
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -40(%rsp,%r8,8), %rsi
-; SSE-NEXT: movq -48(%rsp,%r8,8), %r9
-; SSE-NEXT: movq %r9, %r10
-; SSE-NEXT: shrdq %cl, %rsi, %r10
-; SSE-NEXT: movq -56(%rsp,%r8,8), %r11
-; SSE-NEXT: movq %r11, %rbx
-; SSE-NEXT: shrdq %cl, %r9, %rbx
-; SSE-NEXT: movq -64(%rsp,%r8,8), %r8
-; SSE-NEXT: shrq %cl, %rsi
-; SSE-NEXT: # kill: def $cl killed $cl killed $rcx
-; SSE-NEXT: shrdq %cl, %r11, %r8
-; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: cmoveq %rcx, %rbx
-; SSE-NEXT: cmoveq %rcx, %r10
-; SSE-NEXT: cmoveq %rcx, %r8
-; SSE-NEXT: movq %rdi, %rax
-; SSE-NEXT: cmoveq %rcx, %rsi
-; SSE-NEXT: movq %rsi, 24(%rdi)
-; SSE-NEXT: movq %r10, 16(%rdi)
-; SSE-NEXT: movq %rbx, 8(%rdi)
-; SSE-NEXT: movq %r8, (%rdi)
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: retq
+; SSE2-LABEL: isolate_msb_i256_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: movq 8(%rsi), %r9
+; SSE2-NEXT: movq 16(%rsi), %rdx
+; SSE2-NEXT: movq 24(%rsi), %r8
+; SSE2-NEXT: movdqa (%rsi), %xmm1
+; SSE2-NEXT: por 16(%rsi), %xmm1
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: bsrq %r8, %rcx
+; SSE2-NEXT: xorq $63, %rcx
+; SSE2-NEXT: bsrq %rdx, %r10
+; SSE2-NEXT: xorq $63, %r10
+; SSE2-NEXT: orq $64, %r10
+; SSE2-NEXT: testq %r8, %r8
+; SSE2-NEXT: cmovneq %rcx, %r10
+; SSE2-NEXT: bsrq %r9, %r11
+; SSE2-NEXT: xorq $63, %r11
+; SSE2-NEXT: bsrq (%rsi), %rcx
+; SSE2-NEXT: xorq $63, %rcx
+; SSE2-NEXT: orq $64, %rcx
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: cmovneq %r11, %rcx
+; SSE2-NEXT: orq $128, %rcx
+; SSE2-NEXT: orq %r8, %rdx
+; SSE2-NEXT: cmovneq %r10, %rcx
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; SSE2-NEXT: movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl %ecx, %edx
+; SSE2-NEXT: shrb $6, %dl
+; SSE2-NEXT: movzbl %dl, %esi
+; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq -48(%rsp,%rsi,8), %rdx
+; SSE2-NEXT: movq -56(%rsp,%rsi,8), %r8
+; SSE2-NEXT: movq %r8, %r9
+; SSE2-NEXT: shrdq %cl, %rdx, %r9
+; SSE2-NEXT: movq -64(%rsp,%rsi,8), %r10
+; SSE2-NEXT: movq %r10, %r11
+; SSE2-NEXT: shrdq %cl, %r8, %r11
+; SSE2-NEXT: movq -72(%rsp,%rsi,8), %rsi
+; SSE2-NEXT: shrq %cl, %rdx
+; SSE2-NEXT: # kill: def $cl killed $cl killed $rcx
+; SSE2-NEXT: shrdq %cl, %r10, %rsi
+; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: cmoveq %rcx, %r11
+; SSE2-NEXT: cmoveq %rcx, %r9
+; SSE2-NEXT: cmoveq %rcx, %rsi
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: cmoveq %rcx, %rdx
+; SSE2-NEXT: movq %rdx, 24(%rdi)
+; SSE2-NEXT: movq %r9, 16(%rdi)
+; SSE2-NEXT: movq %r11, 8(%rdi)
+; SSE2-NEXT: movq %rsi, (%rdi)
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: isolate_msb_i256_load:
+; SSE42: # %bb.0:
+; SSE42-NEXT: movq 16(%rsi), %rax
+; SSE42-NEXT: movq 24(%rsi), %rdx
+; SSE42-NEXT: movdqa (%rsi), %xmm0
+; SSE42-NEXT: por 16(%rsi), %xmm0
+; SSE42-NEXT: bsrq %rdx, %rcx
+; SSE42-NEXT: xorq $63, %rcx
+; SSE42-NEXT: bsrq %rax, %r8
+; SSE42-NEXT: xorq $63, %r8
+; SSE42-NEXT: orq $64, %r8
+; SSE42-NEXT: testq %rdx, %rdx
+; SSE42-NEXT: cmovneq %rcx, %r8
+; SSE42-NEXT: movq 8(%rsi), %r9
+; SSE42-NEXT: bsrq %r9, %r10
+; SSE42-NEXT: bsrq (%rsi), %rcx
+; SSE42-NEXT: xorq $63, %r10
+; SSE42-NEXT: xorq $63, %rcx
+; SSE42-NEXT: orq $64, %rcx
+; SSE42-NEXT: testq %r9, %r9
+; SSE42-NEXT: cmovneq %r10, %rcx
+; SSE42-NEXT: orq $128, %rcx
+; SSE42-NEXT: orq %rdx, %rax
+; SSE42-NEXT: cmovneq %r8, %rcx
+; SSE42-NEXT: xorps %xmm1, %xmm1
+; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE42-NEXT: movq %rax, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movl %ecx, %eax
+; SSE42-NEXT: shrb $6, %al
+; SSE42-NEXT: movzbl %al, %eax
+; SSE42-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq -48(%rsp,%rax,8), %rdx
+; SSE42-NEXT: movq -56(%rsp,%rax,8), %rsi
+; SSE42-NEXT: movq %rsi, %r8
+; SSE42-NEXT: shrdq %cl, %rdx, %r8
+; SSE42-NEXT: movq -64(%rsp,%rax,8), %r9
+; SSE42-NEXT: movq %r9, %r10
+; SSE42-NEXT: shrdq %cl, %rsi, %r10
+; SSE42-NEXT: movq -72(%rsp,%rax,8), %rsi
+; SSE42-NEXT: shrq %cl, %rdx
+; SSE42-NEXT: # kill: def $cl killed $cl killed $rcx
+; SSE42-NEXT: shrdq %cl, %r9, %rsi
+; SSE42-NEXT: xorl %ecx, %ecx
+; SSE42-NEXT: ptest %xmm0, %xmm0
+; SSE42-NEXT: cmoveq %rcx, %r10
+; SSE42-NEXT: cmoveq %rcx, %r8
+; SSE42-NEXT: cmoveq %rcx, %rsi
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: cmoveq %rcx, %rdx
+; SSE42-NEXT: movq %rdx, 24(%rdi)
+; SSE42-NEXT: movq %r8, 16(%rdi)
+; SSE42-NEXT: movq %r10, 8(%rdi)
+; SSE42-NEXT: movq %rsi, (%rdi)
+; SSE42-NEXT: retq
;
; AVX2-LABEL: isolate_msb_i256_load:
; AVX2: # %bb.0:
@@ -3098,25 +3155,17 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
;
; AVX512F-LABEL: isolate_msb_i256_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq 8(%rsi), %r8
-; AVX512F-NEXT: movq 16(%rsi), %rax
-; AVX512F-NEXT: movq 24(%rsi), %rdx
; AVX512F-NEXT: vmovdqu (%rsi), %ymm0
-; AVX512F-NEXT: lzcntq %rdx, %rcx
-; AVX512F-NEXT: lzcntq %rax, %r9
-; AVX512F-NEXT: addq $64, %r9
-; AVX512F-NEXT: testq %rdx, %rdx
-; AVX512F-NEXT: cmovneq %rcx, %r9
-; AVX512F-NEXT: lzcntq (%rsi), %rcx
; AVX512F-NEXT: vmovaps {{.*#+}} zmm1 = [0,0,0,9223372036854775808,0,0,0,0]
; AVX512F-NEXT: vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: lzcntq %r8, %rsi
-; AVX512F-NEXT: addq $64, %rcx
-; AVX512F-NEXT: testq %r8, %r8
-; AVX512F-NEXT: cmovneq %rsi, %rcx
-; AVX512F-NEXT: subq $-128, %rcx
-; AVX512F-NEXT: orq %rdx, %rax
-; AVX512F-NEXT: cmovneq %r9, %rcx
+; AVX512F-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT: kshiftlw $12, %k0, %k0
+; AVX512F-NEXT: kshiftrw $12, %k0, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: movl %ecx, %eax
; AVX512F-NEXT: shrb $6, %al
; AVX512F-NEXT: movzbl %al, %edx
@@ -3147,26 +3196,16 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
; AVX512VL-LABEL: isolate_msb_i256_load:
; AVX512VL: # %bb.0:
; AVX512VL-NEXT: vmovdqu (%rsi), %ymm0
-; AVX512VL-NEXT: movq 8(%rsi), %rax
-; AVX512VL-NEXT: movq 16(%rsi), %rdx
-; AVX512VL-NEXT: movq 24(%rsi), %r8
-; AVX512VL-NEXT: lzcntq %r8, %rcx
-; AVX512VL-NEXT: lzcntq %rdx, %r9
-; AVX512VL-NEXT: addq $64, %r9
-; AVX512VL-NEXT: testq %r8, %r8
-; AVX512VL-NEXT: cmovneq %rcx, %r9
-; AVX512VL-NEXT: lzcntq %rax, %r10
-; AVX512VL-NEXT: lzcntq (%rsi), %rcx
-; AVX512VL-NEXT: addq $64, %rcx
-; AVX512VL-NEXT: testq %rax, %rax
-; AVX512VL-NEXT: cmovneq %r10, %rcx
-; AVX512VL-NEXT: subq $-128, %rcx
-; AVX512VL-NEXT: orq %r8, %rdx
-; AVX512VL-NEXT: cmovneq %r9, %rcx
-; AVX512VL-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512VL-NEXT: vptestmq %ymm1, %ymm1, %k1
+; AVX512VL-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT: vpcompressq %ymm1, %ymm1 {%k1} {z}
+; AVX512VL-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: movl %ecx, %eax
; AVX512VL-NEXT: shrb $6, %al
; AVX512VL-NEXT: movzbl %al, %edx
@@ -3197,26 +3236,16 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
; AVX512VBMI-LABEL: isolate_msb_i256_load:
; AVX512VBMI: # %bb.0:
; AVX512VBMI-NEXT: vmovdqu (%rsi), %ymm0
-; AVX512VBMI-NEXT: movq 8(%rsi), %rax
-; AVX512VBMI-NEXT: movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT: movq 24(%rsi), %r8
-; AVX512VBMI-NEXT: lzcntq %r8, %rcx
-; AVX512VBMI-NEXT: lzcntq %rdx, %r9
-; AVX512VBMI-NEXT: addq $64, %r9
-; AVX512VBMI-NEXT: testq %r8, %r8
-; AVX512VBMI-NEXT: cmovneq %rcx, %r9
-; AVX512VBMI-NEXT: lzcntq %rax, %r10
-; AVX512VBMI-NEXT: lzcntq (%rsi), %rcx
-; AVX512VBMI-NEXT: addq $64, %rcx
-; AVX512VBMI-NEXT: testq %rax, %rax
-; AVX512VBMI-NEXT: cmovneq %r10, %rcx
-; AVX512VBMI-NEXT: subq $-128, %rcx
-; AVX512VBMI-NEXT: orq %r8, %rdx
-; AVX512VBMI-NEXT: cmovneq %r9, %rcx
-; AVX512VBMI-NEXT: vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512VBMI-NEXT: vptestmq %ymm1, %ymm1, %k1
+; AVX512VBMI-NEXT: vplzcntq %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VBMI-NEXT: vpcompressq %ymm1, %ymm1 {%k1} {z}
+; AVX512VBMI-NEXT: vxorps %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT: vmovd %xmm1, %ecx
; AVX512VBMI-NEXT: movl %ecx, %eax
; AVX512VBMI-NEXT: shrb $6, %al
; AVX512VBMI-NEXT: movzbl %al, %edx
diff --git a/llvm/test/CodeGen/X86/bit-manip-i512.ll b/llvm/test/CodeGen/X86/bit-manip-i512.ll
index 3723280d2bfa2..407df83d1b2c3 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i512.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i512.ll
@@ -1826,178 +1826,62 @@ define i512 @blsi_i512_load(ptr %p0) nounwind {
;
; AVX512F-LABEL: blsi_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %rbp
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: movq 56(%rsi), %r14
-; AVX512F-NEXT: movq 48(%rsi), %rbx
-; AVX512F-NEXT: movq 40(%rsi), %r10
-; AVX512F-NEXT: movq (%rsi), %rdi
-; AVX512F-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: movq 8(%rsi), %rcx
-; AVX512F-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT: xorl %r8d, %r8d
-; AVX512F-NEXT: negq %rdi
-; AVX512F-NEXT: movl $0, %r9d
-; AVX512F-NEXT: sbbq %rcx, %r9
-; AVX512F-NEXT: movq 16(%rsi), %r15
-; AVX512F-NEXT: movl $0, %r11d
-; AVX512F-NEXT: sbbq %r15, %r11
-; AVX512F-NEXT: movq 24(%rsi), %r12
-; AVX512F-NEXT: movl $0, %r13d
-; AVX512F-NEXT: sbbq %r12, %r13
-; AVX512F-NEXT: movq 32(%rsi), %rsi
-; AVX512F-NEXT: movl $0, %ebp
-; AVX512F-NEXT: sbbq %rsi, %rbp
-; AVX512F-NEXT: movl $0, %edx
-; AVX512F-NEXT: sbbq %r10, %rdx
-; AVX512F-NEXT: movl $0, %ecx
-; AVX512F-NEXT: sbbq %rbx, %rcx
-; AVX512F-NEXT: sbbq %r14, %r8
-; AVX512F-NEXT: andq %r14, %r8
-; AVX512F-NEXT: andq %rbx, %rcx
-; AVX512F-NEXT: andq %r10, %rdx
-; AVX512F-NEXT: andq %rsi, %rbp
-; AVX512F-NEXT: andq %r12, %r13
-; AVX512F-NEXT: andq %r15, %r11
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; AVX512F-NEXT: movq %rdi, (%rax)
-; AVX512F-NEXT: movq %r9, 8(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %r13, 24(%rax)
-; AVX512F-NEXT: movq %rbp, 32(%rax)
-; AVX512F-NEXT: movq %rdx, 40(%rax)
-; AVX512F-NEXT: movq %rcx, 48(%rax)
-; AVX512F-NEXT: movq %r8, 56(%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
-; AVX512F-NEXT: popq %rbp
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT: vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: blsi_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %rbp
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq 56(%rsi), %r11
-; AVX512VL-NEXT: movq 48(%rsi), %rbx
-; AVX512VL-NEXT: movq 40(%rsi), %r14
-; AVX512VL-NEXT: movq 32(%rsi), %r15
-; AVX512VL-NEXT: movq 24(%rsi), %r9
-; AVX512VL-NEXT: movq 16(%rsi), %rdx
-; AVX512VL-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT: movq (%rsi), %rcx
-; AVX512VL-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT: movq 8(%rsi), %r8
-; AVX512VL-NEXT: xorl %edi, %edi
-; AVX512VL-NEXT: movq %rcx, %rsi
-; AVX512VL-NEXT: negq %rsi
-; AVX512VL-NEXT: movl $0, %r10d
-; AVX512VL-NEXT: sbbq %r8, %r10
-; AVX512VL-NEXT: movl $0, %r12d
-; AVX512VL-NEXT: sbbq %rdx, %r12
-; AVX512VL-NEXT: movl $0, %r13d
-; AVX512VL-NEXT: sbbq %r9, %r13
-; AVX512VL-NEXT: movl $0, %ebp
-; AVX512VL-NEXT: sbbq %r15, %rbp
-; AVX512VL-NEXT: movl $0, %edx
-; AVX512VL-NEXT: sbbq %r14, %rdx
-; AVX512VL-NEXT: movl $0, %ecx
-; AVX512VL-NEXT: sbbq %rbx, %rcx
-; AVX512VL-NEXT: sbbq %r11, %rdi
-; AVX512VL-NEXT: andq %r11, %rdi
-; AVX512VL-NEXT: andq %rbx, %rcx
-; AVX512VL-NEXT: andq %r14, %rdx
-; AVX512VL-NEXT: andq %r15, %rbp
-; AVX512VL-NEXT: andq %r9, %r13
-; AVX512VL-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512VL-NEXT: andq %r8, %r10
-; AVX512VL-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512VL-NEXT: movq %rsi, (%rax)
-; AVX512VL-NEXT: movq %r10, 8(%rax)
-; AVX512VL-NEXT: movq %r12, 16(%rax)
-; AVX512VL-NEXT: movq %r13, 24(%rax)
-; AVX512VL-NEXT: movq %rbp, 32(%rax)
-; AVX512VL-NEXT: movq %rdx, 40(%rax)
-; AVX512VL-NEXT: movq %rcx, 48(%rax)
-; AVX512VL-NEXT: movq %rdi, 56(%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
-; AVX512VL-NEXT: popq %rbp
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovd %k0, %ecx
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %edx
+; AVX512VL-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT: xorl %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT: vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: blsi_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %rbp
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq 56(%rsi), %r11
-; AVX512VBMI-NEXT: movq 48(%rsi), %rbx
-; AVX512VBMI-NEXT: movq 40(%rsi), %r14
-; AVX512VBMI-NEXT: movq 32(%rsi), %r15
-; AVX512VBMI-NEXT: movq 24(%rsi), %r9
-; AVX512VBMI-NEXT: movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT: movq (%rsi), %rcx
-; AVX512VBMI-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT: movq 8(%rsi), %r8
-; AVX512VBMI-NEXT: xorl %edi, %edi
-; AVX512VBMI-NEXT: movq %rcx, %rsi
-; AVX512VBMI-NEXT: negq %rsi
-; AVX512VBMI-NEXT: movl $0, %r10d
-; AVX512VBMI-NEXT: sbbq %r8, %r10
-; AVX512VBMI-NEXT: movl $0, %r12d
-; AVX512VBMI-NEXT: sbbq %rdx, %r12
-; AVX512VBMI-NEXT: movl $0, %r13d
-; AVX512VBMI-NEXT: sbbq %r9, %r13
-; AVX512VBMI-NEXT: movl $0, %ebp
-; AVX512VBMI-NEXT: sbbq %r15, %rbp
-; AVX512VBMI-NEXT: movl $0, %edx
-; AVX512VBMI-NEXT: sbbq %r14, %rdx
-; AVX512VBMI-NEXT: movl $0, %ecx
-; AVX512VBMI-NEXT: sbbq %rbx, %rcx
-; AVX512VBMI-NEXT: sbbq %r11, %rdi
-; AVX512VBMI-NEXT: andq %r11, %rdi
-; AVX512VBMI-NEXT: andq %rbx, %rcx
-; AVX512VBMI-NEXT: andq %r14, %rdx
-; AVX512VBMI-NEXT: andq %r15, %rbp
-; AVX512VBMI-NEXT: andq %r9, %r13
-; AVX512VBMI-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512VBMI-NEXT: andq %r8, %r10
-; AVX512VBMI-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512VBMI-NEXT: movq %rsi, (%rax)
-; AVX512VBMI-NEXT: movq %r10, 8(%rax)
-; AVX512VBMI-NEXT: movq %r12, 16(%rax)
-; AVX512VBMI-NEXT: movq %r13, 24(%rax)
-; AVX512VBMI-NEXT: movq %rbp, 32(%rax)
-; AVX512VBMI-NEXT: movq %rdx, 40(%rax)
-; AVX512VBMI-NEXT: movq %rcx, 48(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
-; AVX512VBMI-NEXT: popq %rbp
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vptestmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %ecx
+; AVX512VBMI-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %edx
+; AVX512VBMI-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT: xorl %edx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT: vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT: vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%neg = sub i512 0, %a0
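
The blsi pattern under test here is "isolate lowest set bit", x & (0 - x). In the new AVX-512 output the 512-bit negate is formed lane-by-lane (vpsubq against zero), and the inter-lane borrow of the wide two's complement is reconstructed from the per-lane zero/nonzero masks by the kmovw/leal/xorl sequence. A minimal C sketch of that mask computation (the function name and the explicit 8-lane layout are illustrative, not taken from the patch):

    #include <stdint.h>

    /* Given the "lane i of an 8 x i64 value is all-zero" mask, return the
     * lanes where the +1 of the wide negate 0 - x = ~x + 1 has already been
     * absorbed by a lower nonzero lane. Those lanes must hold plain ~x[i];
     * the remaining lanes keep the lane-local 0 - x[i] from vpsubq. */
    static uint8_t negate_fixup_mask(uint8_t zero) {
      uint8_t nonzero = (uint8_t)~zero;
      /* The integer add lets a nonzero lane carry across the run of zero
       * lanes above it, mirroring how the borrow ripples through the
       * 512-bit subtract: bit i is set iff some lane below i is nonzero. */
      return (uint8_t)(((2u * nonzero) + zero) ^ zero);
    }

With that mask in k1, the masked vpxorq with all-ones overrides the per-lane negate and the final vpandq completes x & -x, with no cross-lane data movement at all.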
@@ -2621,62 +2505,62 @@ define i512 @blsmsk_i512_load(ptr %p0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: blsmsk_i512_load:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %r9
-; AVX512-NEXT: movq 40(%rsi), %r10
-; AVX512-NEXT: movq 32(%rsi), %r11
-; AVX512-NEXT: movq 24(%rsi), %rbx
-; AVX512-NEXT: movq 16(%rsi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsi), %r14
-; AVX512-NEXT: addq $-1, %rdx
-; AVX512-NEXT: movq %r14, %rsi
-; AVX512-NEXT: adcq $-1, %rsi
-; AVX512-NEXT: adcq $-1, %r15
-; AVX512-NEXT: movq %rbx, %r12
-; AVX512-NEXT: adcq $-1, %r12
-; AVX512-NEXT: movq %r11, %r13
-; AVX512-NEXT: adcq $-1, %r13
-; AVX512-NEXT: movq %r10, %rbp
-; AVX512-NEXT: adcq $-1, %rbp
-; AVX512-NEXT: movq %r9, %r8
-; AVX512-NEXT: adcq $-1, %r8
-; AVX512-NEXT: movq %rcx, %rdi
-; AVX512-NEXT: adcq $-1, %rdi
-; AVX512-NEXT: xorq %rcx, %rdi
-; AVX512-NEXT: xorq %r9, %r8
-; AVX512-NEXT: xorq %r10, %rbp
-; AVX512-NEXT: xorq %r11, %r13
-; AVX512-NEXT: xorq %rbx, %r12
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; AVX512-NEXT: xorq %r14, %rsi
-; AVX512-NEXT: xorq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT: movq %rdx, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r15, 16(%rax)
-; AVX512-NEXT: movq %r12, 24(%rax)
-; AVX512-NEXT: movq %r13, 32(%rax)
-; AVX512-NEXT: movq %rbp, 40(%rax)
-; AVX512-NEXT: movq %r8, 48(%rax)
-; AVX512-NEXT: movq %rdi, 56(%rax)
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: blsmsk_i512_load:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: blsmsk_i512_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %edx
+; AVX512VL-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT: xorl %ecx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: blsmsk_i512_load:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %ecx
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %edx
+; AVX512VBMI-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VBMI-NEXT: xorl %ecx, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpxorq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%dec = sub i512 %a0, 1
%res = xor i512 %a0, %dec
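
blsmsk is x ^ (x - 1), a mask of the low bits up to and including the lowest set bit. The wide decrement is assembled the same way as the negate above: vpaddq with all-ones gives the lane-local x[i] - 1, vptestnmq/vpcmpltuq recover the zero and nonzero lane masks, and the same ripple formula selects the lanes the borrow never reaches. A sketch under the same illustrative 8-lane layout:

    #include <stdint.h>

    /* 512-bit x - 1 from lane-local results: lanes the borrow reaches (all
     * lower lanes zero) take x[i] - 1; the rest keep x[i], which is what
     * the masked vmovdqa64 in the output above does. */
    static void wide_dec(const uint64_t x[8], uint64_t out[8]) {
      uint8_t zero = 0;
      for (int i = 0; i < 8; ++i)
        zero |= (uint8_t)((x[i] == 0) << i);
      /* Same carry-ripple trick as for blsi. */
      uint8_t keep = (uint8_t)(((2u * (uint8_t)~zero) + zero) ^ zero);
      for (int i = 0; i < 8; ++i)
        out[i] = (keep >> i & 1) ? x[i] : x[i] - 1;
    }

The final vpxorq then xors the decremented value back against x, lane by lane.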
@@ -3299,62 +3183,62 @@ define i512 @blsr_i512_load(ptr %p0) nounwind {
; AVX2-NEXT: popq %rbp
; AVX2-NEXT: retq
;
-; AVX512-LABEL: blsr_i512_load:
-; AVX512: # %bb.0:
-; AVX512-NEXT: pushq %rbp
-; AVX512-NEXT: pushq %r15
-; AVX512-NEXT: pushq %r14
-; AVX512-NEXT: pushq %r13
-; AVX512-NEXT: pushq %r12
-; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq 56(%rsi), %rcx
-; AVX512-NEXT: movq 48(%rsi), %r9
-; AVX512-NEXT: movq 40(%rsi), %r10
-; AVX512-NEXT: movq 32(%rsi), %r11
-; AVX512-NEXT: movq 24(%rsi), %rbx
-; AVX512-NEXT: movq 16(%rsi), %r15
-; AVX512-NEXT: movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq (%rsi), %rdx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq 8(%rsi), %r14
-; AVX512-NEXT: addq $-1, %rdx
-; AVX512-NEXT: movq %r14, %rsi
-; AVX512-NEXT: adcq $-1, %rsi
-; AVX512-NEXT: adcq $-1, %r15
-; AVX512-NEXT: movq %rbx, %r12
-; AVX512-NEXT: adcq $-1, %r12
-; AVX512-NEXT: movq %r11, %r13
-; AVX512-NEXT: adcq $-1, %r13
-; AVX512-NEXT: movq %r10, %rbp
-; AVX512-NEXT: adcq $-1, %rbp
-; AVX512-NEXT: movq %r9, %r8
-; AVX512-NEXT: adcq $-1, %r8
-; AVX512-NEXT: movq %rcx, %rdi
-; AVX512-NEXT: adcq $-1, %rdi
-; AVX512-NEXT: andq %rcx, %rdi
-; AVX512-NEXT: andq %r9, %r8
-; AVX512-NEXT: andq %r10, %rbp
-; AVX512-NEXT: andq %r11, %r13
-; AVX512-NEXT: andq %rbx, %r12
-; AVX512-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; AVX512-NEXT: andq %r14, %rsi
-; AVX512-NEXT: andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT: movq %rdx, (%rax)
-; AVX512-NEXT: movq %rsi, 8(%rax)
-; AVX512-NEXT: movq %r15, 16(%rax)
-; AVX512-NEXT: movq %r12, 24(%rax)
-; AVX512-NEXT: movq %r13, 32(%rax)
-; AVX512-NEXT: movq %rbp, 40(%rax)
-; AVX512-NEXT: movq %r8, 48(%rax)
-; AVX512-NEXT: movq %rdi, 56(%rax)
-; AVX512-NEXT: popq %rbx
-; AVX512-NEXT: popq %r12
-; AVX512-NEXT: popq %r13
-; AVX512-NEXT: popq %r14
-; AVX512-NEXT: popq %r15
-; AVX512-NEXT: popq %rbp
-; AVX512-NEXT: retq
+; AVX512F-LABEL: blsr_i512_load:
+; AVX512F: # %bb.0:
+; AVX512F-NEXT: movq %rdi, %rax
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT: kmovw %k0, %ecx
+; AVX512F-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT: kmovw %k0, %edx
+; AVX512F-NEXT: movzbl %dl, %edx
+; AVX512F-NEXT: leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT: xorl %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT: retq
+;
+; AVX512VL-LABEL: blsr_i512_load:
+; AVX512VL: # %bb.0:
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: movq %rdi, %rax
+; AVX512VL-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT: kmovb %k0, %ecx
+; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT: kmovd %k0, %edx
+; AVX512VL-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT: xorl %ecx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
+; AVX512VL-NEXT: retq
+;
+; AVX512VBMI-LABEL: blsr_i512_load:
+; AVX512VBMI: # %bb.0:
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: movq %rdi, %rax
+; AVX512VBMI-NEXT: vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT: kmovb %k0, %ecx
+; AVX512VBMI-NEXT: vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT: vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT: vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT: kmovd %k0, %edx
+; AVX512VBMI-NEXT: leal (%rcx,%rdx,2), %edx
+; AVX512VBMI-NEXT: xorl %ecx, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT: vpandq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
+; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%dec = sub i512 %a0, 1
%res = and i512 %a0, %dec
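
blsr, "reset lowest set bit", is x & (x - 1) and reuses exactly the same masked decrement; only the final combine differs (vpandq instead of vpxorq). In terms of the wide_dec sketch above:

    /* blsr over the illustrative 8-lane form; blsmsk would use ^ instead. */
    static void blsr8(const uint64_t x[8], uint64_t out[8]) {
      uint64_t dec[8];
      wide_dec(x, dec);
      for (int i = 0; i < 8; ++i)
        out[i] = x[i] & dec[i];
    }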
@@ -5065,257 +4949,353 @@ define i512 @isolate_msb_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
}
define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
-; SSE-LABEL: isolate_msb_i512_load:
-; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: pushq %rax
-; SSE-NEXT: movq 32(%rsi), %rcx
-; SSE-NEXT: movq 48(%rsi), %r8
-; SSE-NEXT: movq 16(%rsi), %r11
-; SSE-NEXT: movq 40(%rsi), %r9
-; SSE-NEXT: movq (%rsi), %r15
-; SSE-NEXT: movq 8(%rsi), %r14
-; SSE-NEXT: movq 56(%rsi), %r10
-; SSE-NEXT: movq 24(%rsi), %rbx
-; SSE-NEXT: movq %rbx, %rdx
-; SSE-NEXT: orq %r10, %rdx
-; SSE-NEXT: movq %r14, %rax
-; SSE-NEXT: orq %r9, %rax
-; SSE-NEXT: orq %rdx, %rax
-; SSE-NEXT: movq %r11, %rsi
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: movq %r15, %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: orq %rsi, %rdx
-; SSE-NEXT: bsrq %r10, %rsi
-; SSE-NEXT: xorq $63, %rsi
-; SSE-NEXT: bsrq %r8, %r13
-; SSE-NEXT: xorq $63, %r13
-; SSE-NEXT: orq $64, %r13
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovneq %rsi, %r13
-; SSE-NEXT: bsrq %r9, %rsi
-; SSE-NEXT: xorq $63, %rsi
-; SSE-NEXT: bsrq %rcx, %r12
-; SSE-NEXT: xorq $63, %r12
-; SSE-NEXT: orq $64, %r12
-; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: cmovneq %rsi, %r12
-; SSE-NEXT: orq $128, %r12
-; SSE-NEXT: movq %r8, %rsi
-; SSE-NEXT: orq %r10, %rsi
-; SSE-NEXT: cmovneq %r13, %r12
-; SSE-NEXT: bsrq %rbx, %rsi
-; SSE-NEXT: xorq $63, %rsi
-; SSE-NEXT: bsrq %r11, %r13
-; SSE-NEXT: xorq $63, %r13
-; SSE-NEXT: orq $64, %r13
-; SSE-NEXT: testq %rbx, %rbx
-; SSE-NEXT: cmovneq %rsi, %r13
-; SSE-NEXT: bsrq %r14, %rbp
-; SSE-NEXT: xorq $63, %rbp
-; SSE-NEXT: bsrq %r15, %rsi
-; SSE-NEXT: xorq $63, %rsi
-; SSE-NEXT: orq $64, %rsi
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovneq %rbp, %rsi
-; SSE-NEXT: orq $128, %rsi
-; SSE-NEXT: orq %rbx, %r11
-; SSE-NEXT: cmovneq %r13, %rsi
-; SSE-NEXT: orq $256, %rsi # imm = 0x100
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: orq %r8, %rcx
-; SSE-NEXT: orq %r9, %rcx
-; SSE-NEXT: cmovneq %r12, %rsi
-; SSE-NEXT: xorps %xmm0, %xmm0
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movl %esi, %ecx
-; SSE-NEXT: andl $63, %ecx
-; SSE-NEXT: shrl $3, %esi
-; SSE-NEXT: andl $56, %esi
-; SSE-NEXT: movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT: movq -72(%rsp,%rsi), %r8
-; SSE-NEXT: movq -80(%rsp,%rsi), %r11
-; SSE-NEXT: movq %r11, %r9
-; SSE-NEXT: shrdq %cl, %r8, %r9
-; SSE-NEXT: movq -88(%rsp,%rsi), %rbx
-; SSE-NEXT: movq %rbx, %r10
-; SSE-NEXT: shrdq %cl, %r11, %r10
-; SSE-NEXT: movq -96(%rsp,%rsi), %r14
-; SSE-NEXT: movq %r14, %r11
-; SSE-NEXT: shrdq %cl, %rbx, %r11
-; SSE-NEXT: movq -104(%rsp,%rsi), %r15
-; SSE-NEXT: movq %r15, %rbx
-; SSE-NEXT: shrdq %cl, %r14, %rbx
-; SSE-NEXT: movq -112(%rsp,%rsi), %r12
-; SSE-NEXT: movq %r12, %r14
-; SSE-NEXT: shrdq %cl, %r15, %r14
-; SSE-NEXT: movq -120(%rsp,%rsi), %r13
-; SSE-NEXT: movq %r13, %r15
-; SSE-NEXT: shrdq %cl, %r12, %r15
-; SSE-NEXT: movq -128(%rsp,%rsi), %rsi
-; SSE-NEXT: shrq %cl, %r8
-; SSE-NEXT: # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT: shrdq %cl, %r13, %rsi
-; SSE-NEXT: xorl %ecx, %ecx
-; SSE-NEXT: orq %rax, %rdx
-; SSE-NEXT: cmoveq %rcx, %r15
-; SSE-NEXT: cmoveq %rcx, %r14
-; SSE-NEXT: cmoveq %rcx, %rbx
-; SSE-NEXT: cmoveq %rcx, %r11
-; SSE-NEXT: cmoveq %rcx, %r10
-; SSE-NEXT: cmoveq %rcx, %r9
-; SSE-NEXT: cmoveq %rcx, %rsi
-; SSE-NEXT: movq %rdi, %rax
-; SSE-NEXT: cmoveq %rcx, %r8
-; SSE-NEXT: movq %r8, 56(%rdi)
-; SSE-NEXT: movq %r9, 48(%rdi)
-; SSE-NEXT: movq %r10, 40(%rdi)
-; SSE-NEXT: movq %r11, 32(%rdi)
-; SSE-NEXT: movq %rbx, 24(%rdi)
-; SSE-NEXT: movq %r14, 16(%rdi)
-; SSE-NEXT: movq %r15, 8(%rdi)
-; SSE-NEXT: movq %rsi, (%rdi)
-; SSE-NEXT: addq $8, %rsp
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
-; SSE-NEXT: retq
+; SSE2-LABEL: isolate_msb_i512_load:
+; SSE2: # %bb.0:
+; SSE2-NEXT: pushq %r15
+; SSE2-NEXT: pushq %r14
+; SSE2-NEXT: pushq %r12
+; SSE2-NEXT: pushq %rbx
+; SSE2-NEXT: pushq %rax
+; SSE2-NEXT: movq 8(%rsi), %r9
+; SSE2-NEXT: movq 16(%rsi), %rcx
+; SSE2-NEXT: movq 24(%rsi), %r8
+; SSE2-NEXT: movq 48(%rsi), %rdx
+; SSE2-NEXT: movq 56(%rsi), %r11
+; SSE2-NEXT: movdqa 32(%rsi), %xmm1
+; SSE2-NEXT: movdqa 48(%rsi), %xmm2
+; SSE2-NEXT: movdqa 16(%rsi), %xmm0
+; SSE2-NEXT: por %xmm2, %xmm0
+; SSE2-NEXT: movdqa (%rsi), %xmm3
+; SSE2-NEXT: por %xmm1, %xmm3
+; SSE2-NEXT: por %xmm0, %xmm3
+; SSE2-NEXT: pxor %xmm0, %xmm0
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT: movmskps %xmm3, %eax
+; SSE2-NEXT: xorl $15, %eax
+; SSE2-NEXT: bsrq %r11, %r10
+; SSE2-NEXT: xorq $63, %r10
+; SSE2-NEXT: bsrq %rdx, %rbx
+; SSE2-NEXT: xorq $63, %rbx
+; SSE2-NEXT: orq $64, %rbx
+; SSE2-NEXT: testq %r11, %r11
+; SSE2-NEXT: cmovneq %r10, %rbx
+; SSE2-NEXT: movq 40(%rsi), %r14
+; SSE2-NEXT: bsrq %r14, %r15
+; SSE2-NEXT: bsrq 32(%rsi), %r10
+; SSE2-NEXT: xorq $63, %r15
+; SSE2-NEXT: xorq $63, %r10
+; SSE2-NEXT: orq $64, %r10
+; SSE2-NEXT: testq %r14, %r14
+; SSE2-NEXT: cmovneq %r15, %r10
+; SSE2-NEXT: orq $128, %r10
+; SSE2-NEXT: orq %r11, %rdx
+; SSE2-NEXT: cmovneq %rbx, %r10
+; SSE2-NEXT: bsrq %r8, %rdx
+; SSE2-NEXT: xorq $63, %rdx
+; SSE2-NEXT: bsrq %rcx, %r11
+; SSE2-NEXT: xorq $63, %r11
+; SSE2-NEXT: orq $64, %r11
+; SSE2-NEXT: testq %r8, %r8
+; SSE2-NEXT: cmovneq %rdx, %r11
+; SSE2-NEXT: bsrq %r9, %rbx
+; SSE2-NEXT: xorq $63, %rbx
+; SSE2-NEXT: bsrq (%rsi), %rdx
+; SSE2-NEXT: xorq $63, %rdx
+; SSE2-NEXT: orq $64, %rdx
+; SSE2-NEXT: testq %r9, %r9
+; SSE2-NEXT: cmovneq %rbx, %rdx
+; SSE2-NEXT: orq $128, %rdx
+; SSE2-NEXT: orq %r8, %rcx
+; SSE2-NEXT: cmovneq %r11, %rdx
+; SSE2-NEXT: orq $256, %rdx # imm = 0x100
+; SSE2-NEXT: por %xmm2, %xmm1
+; SSE2-NEXT: pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT: movmskps %xmm1, %ecx
+; SSE2-NEXT: xorl $15, %ecx
+; SSE2-NEXT: cmovneq %r10, %rdx
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE2-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movl %edx, %ecx
+; SSE2-NEXT: andl $63, %ecx
+; SSE2-NEXT: shrl $3, %edx
+; SSE2-NEXT: andl $56, %edx
+; SSE2-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT: movq -72(%rsp,%rdx), %rsi
+; SSE2-NEXT: movq -80(%rsp,%rdx), %r10
+; SSE2-NEXT: movq %r10, %r8
+; SSE2-NEXT: shrdq %cl, %rsi, %r8
+; SSE2-NEXT: movq -88(%rsp,%rdx), %r11
+; SSE2-NEXT: movq %r11, %r9
+; SSE2-NEXT: shrdq %cl, %r10, %r9
+; SSE2-NEXT: movq -96(%rsp,%rdx), %rbx
+; SSE2-NEXT: movq %rbx, %r10
+; SSE2-NEXT: shrdq %cl, %r11, %r10
+; SSE2-NEXT: movq -104(%rsp,%rdx), %r14
+; SSE2-NEXT: movq %r14, %r11
+; SSE2-NEXT: shrdq %cl, %rbx, %r11
+; SSE2-NEXT: movq -112(%rsp,%rdx), %r15
+; SSE2-NEXT: movq %r15, %rbx
+; SSE2-NEXT: shrdq %cl, %r14, %rbx
+; SSE2-NEXT: movq -120(%rsp,%rdx), %r12
+; SSE2-NEXT: movq %r12, %r14
+; SSE2-NEXT: shrdq %cl, %r15, %r14
+; SSE2-NEXT: movq -128(%rsp,%rdx), %rdx
+; SSE2-NEXT: shrq %cl, %rsi
+; SSE2-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE2-NEXT: shrdq %cl, %r12, %rdx
+; SSE2-NEXT: xorl %ecx, %ecx
+; SSE2-NEXT: testl %eax, %eax
+; SSE2-NEXT: cmoveq %rcx, %r14
+; SSE2-NEXT: cmoveq %rcx, %rbx
+; SSE2-NEXT: cmoveq %rcx, %r11
+; SSE2-NEXT: cmoveq %rcx, %r10
+; SSE2-NEXT: cmoveq %rcx, %r9
+; SSE2-NEXT: cmoveq %rcx, %r8
+; SSE2-NEXT: cmoveq %rcx, %rdx
+; SSE2-NEXT: movq %rdi, %rax
+; SSE2-NEXT: cmoveq %rcx, %rsi
+; SSE2-NEXT: movq %rsi, 56(%rdi)
+; SSE2-NEXT: movq %r8, 48(%rdi)
+; SSE2-NEXT: movq %r9, 40(%rdi)
+; SSE2-NEXT: movq %r10, 32(%rdi)
+; SSE2-NEXT: movq %r11, 24(%rdi)
+; SSE2-NEXT: movq %rbx, 16(%rdi)
+; SSE2-NEXT: movq %r14, 8(%rdi)
+; SSE2-NEXT: movq %rdx, (%rdi)
+; SSE2-NEXT: addq $8, %rsp
+; SSE2-NEXT: popq %rbx
+; SSE2-NEXT: popq %r12
+; SSE2-NEXT: popq %r14
+; SSE2-NEXT: popq %r15
+; SSE2-NEXT: retq
+;
+; SSE42-LABEL: isolate_msb_i512_load:
+; SSE42: # %bb.0:
+; SSE42-NEXT: pushq %r15
+; SSE42-NEXT: pushq %r14
+; SSE42-NEXT: pushq %rbx
+; SSE42-NEXT: movq 8(%rsi), %r8
+; SSE42-NEXT: movq 16(%rsi), %rcx
+; SSE42-NEXT: movq 24(%rsi), %rdx
+; SSE42-NEXT: movq 40(%rsi), %r11
+; SSE42-NEXT: movq 48(%rsi), %rax
+; SSE42-NEXT: movq 56(%rsi), %r10
+; SSE42-NEXT: movdqa 32(%rsi), %xmm2
+; SSE42-NEXT: movdqa 48(%rsi), %xmm0
+; SSE42-NEXT: movdqa (%rsi), %xmm1
+; SSE42-NEXT: bsrq %r10, %r9
+; SSE42-NEXT: xorq $63, %r9
+; SSE42-NEXT: bsrq %rax, %rbx
+; SSE42-NEXT: xorq $63, %rbx
+; SSE42-NEXT: orq $64, %rbx
+; SSE42-NEXT: testq %r10, %r10
+; SSE42-NEXT: cmovneq %r9, %rbx
+; SSE42-NEXT: bsrq %r11, %r14
+; SSE42-NEXT: xorq $63, %r14
+; SSE42-NEXT: bsrq 32(%rsi), %r9
+; SSE42-NEXT: xorq $63, %r9
+; SSE42-NEXT: orq $64, %r9
+; SSE42-NEXT: testq %r11, %r11
+; SSE42-NEXT: cmovneq %r14, %r9
+; SSE42-NEXT: orq $128, %r9
+; SSE42-NEXT: orq %r10, %rax
+; SSE42-NEXT: cmovneq %rbx, %r9
+; SSE42-NEXT: bsrq %rdx, %rax
+; SSE42-NEXT: xorq $63, %rax
+; SSE42-NEXT: bsrq %rcx, %r10
+; SSE42-NEXT: xorq $63, %r10
+; SSE42-NEXT: orq $64, %r10
+; SSE42-NEXT: testq %rdx, %rdx
+; SSE42-NEXT: cmovneq %rax, %r10
+; SSE42-NEXT: por %xmm2, %xmm1
+; SSE42-NEXT: bsrq %r8, %r11
+; SSE42-NEXT: bsrq (%rsi), %rax
+; SSE42-NEXT: xorq $63, %r11
+; SSE42-NEXT: xorq $63, %rax
+; SSE42-NEXT: orq $64, %rax
+; SSE42-NEXT: testq %r8, %r8
+; SSE42-NEXT: cmovneq %r11, %rax
+; SSE42-NEXT: orq $128, %rax
+; SSE42-NEXT: orq %rdx, %rcx
+; SSE42-NEXT: cmovneq %r10, %rax
+; SSE42-NEXT: orq $256, %rax # imm = 0x100
+; SSE42-NEXT: por %xmm0, %xmm2
+; SSE42-NEXT: ptest %xmm2, %xmm2
+; SSE42-NEXT: cmovneq %r9, %rax
+; SSE42-NEXT: movdqa 16(%rsi), %xmm2
+; SSE42-NEXT: xorps %xmm3, %xmm3
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE42-NEXT: movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movl %eax, %ecx
+; SSE42-NEXT: andl $63, %ecx
+; SSE42-NEXT: shrl $3, %eax
+; SSE42-NEXT: andl $56, %eax
+; SSE42-NEXT: movq $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT: movq -72(%rsp,%rax), %rdx
+; SSE42-NEXT: movq -80(%rsp,%rax), %r9
+; SSE42-NEXT: movq %r9, %rsi
+; SSE42-NEXT: shrdq %cl, %rdx, %rsi
+; SSE42-NEXT: movq -88(%rsp,%rax), %r10
+; SSE42-NEXT: movq %r10, %r8
+; SSE42-NEXT: shrdq %cl, %r9, %r8
+; SSE42-NEXT: movq -96(%rsp,%rax), %r11
+; SSE42-NEXT: movq %r11, %r9
+; SSE42-NEXT: shrdq %cl, %r10, %r9
+; SSE42-NEXT: movq -104(%rsp,%rax), %rbx
+; SSE42-NEXT: movq %rbx, %r10
+; SSE42-NEXT: shrdq %cl, %r11, %r10
+; SSE42-NEXT: movq -112(%rsp,%rax), %r14
+; SSE42-NEXT: movq %r14, %r11
+; SSE42-NEXT: shrdq %cl, %rbx, %r11
+; SSE42-NEXT: movq -120(%rsp,%rax), %r15
+; SSE42-NEXT: movq %r15, %rbx
+; SSE42-NEXT: shrdq %cl, %r14, %rbx
+; SSE42-NEXT: movq -128(%rsp,%rax), %r14
+; SSE42-NEXT: shrq %cl, %rdx
+; SSE42-NEXT: # kill: def $cl killed $cl killed $ecx
+; SSE42-NEXT: shrdq %cl, %r15, %r14
+; SSE42-NEXT: por %xmm0, %xmm2
+; SSE42-NEXT: por %xmm2, %xmm1
+; SSE42-NEXT: xorl %ecx, %ecx
+; SSE42-NEXT: ptest %xmm1, %xmm1
+; SSE42-NEXT: cmoveq %rcx, %rbx
+; SSE42-NEXT: cmoveq %rcx, %r11
+; SSE42-NEXT: cmoveq %rcx, %r10
+; SSE42-NEXT: cmoveq %rcx, %r9
+; SSE42-NEXT: cmoveq %rcx, %r8
+; SSE42-NEXT: cmoveq %rcx, %rsi
+; SSE42-NEXT: cmoveq %rcx, %r14
+; SSE42-NEXT: movq %rdi, %rax
+; SSE42-NEXT: cmoveq %rcx, %rdx
+; SSE42-NEXT: movq %rdx, 56(%rdi)
+; SSE42-NEXT: movq %rsi, 48(%rdi)
+; SSE42-NEXT: movq %r8, 40(%rdi)
+; SSE42-NEXT: movq %r9, 32(%rdi)
+; SSE42-NEXT: movq %r10, 24(%rdi)
+; SSE42-NEXT: movq %r11, 16(%rdi)
+; SSE42-NEXT: movq %rbx, 8(%rdi)
+; SSE42-NEXT: movq %r14, (%rdi)
+; SSE42-NEXT: popq %rbx
+; SSE42-NEXT: popq %r14
+; SSE42-NEXT: popq %r15
+; SSE42-NEXT: retq
;
; AVX2-LABEL: isolate_msb_i512_load:
; AVX2: # %bb.0:
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 32(%rsi), %rax
-; AVX2-NEXT: movq 48(%rsi), %rcx
-; AVX2-NEXT: movq 16(%rsi), %r11
-; AVX2-NEXT: movq 40(%rsi), %r8
-; AVX2-NEXT: movq (%rsi), %r9
-; AVX2-NEXT: movq 8(%rsi), %r14
-; AVX2-NEXT: movq 56(%rsi), %r10
-; AVX2-NEXT: movq 24(%rsi), %rbx
-; AVX2-NEXT: movq %rbx, %rsi
-; AVX2-NEXT: orq %r10, %rsi
-; AVX2-NEXT: movq %r14, %rdx
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: movq %r11, %r15
-; AVX2-NEXT: orq %rcx, %r15
-; AVX2-NEXT: movq %r9, %rsi
-; AVX2-NEXT: orq %rax, %rsi
-; AVX2-NEXT: orq %r15, %rsi
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: lzcntq %r10, %r15
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %rcx, %r12
-; AVX2-NEXT: addq $64, %r12
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovneq %r15, %r12
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: lzcntq %r8, %r13
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: lzcntq %rax, %r15
-; AVX2-NEXT: addq $64, %r15
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovneq %r13, %r15
-; AVX2-NEXT: subq $-128, %r15
-; AVX2-NEXT: movq %rcx, %r13
-; AVX2-NEXT: orq %r10, %r13
-; AVX2-NEXT: cmovneq %r12, %r15
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %rbx, %r12
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: lzcntq %r11, %r13
-; AVX2-NEXT: addq $64, %r13
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovneq %r12, %r13
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %r14, %r12
-; AVX2-NEXT: lzcntq %r9, %r9
+; AVX2-NEXT: vmovdqu 32(%rsi), %ymm1
+; AVX2-NEXT: movq 8(%rsi), %r8
+; AVX2-NEXT: movq 16(%rsi), %rax
+; AVX2-NEXT: movq 24(%rsi), %rcx
+; AVX2-NEXT: movq 40(%rsi), %rdx
+; AVX2-NEXT: movq 48(%rsi), %r10
+; AVX2-NEXT: vpor (%rsi), %ymm1, %ymm0
+; AVX2-NEXT: movq 56(%rsi), %r11
+; AVX2-NEXT: lzcntq %r11, %r9
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r10, %rbx
+; AVX2-NEXT: addq $64, %rbx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovneq %r9, %rbx
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %rdx, %r14
+; AVX2-NEXT: xorl %r9d, %r9d
+; AVX2-NEXT: lzcntq 32(%rsi), %r9
; AVX2-NEXT: addq $64, %r9
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovneq %r12, %r9
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovneq %r14, %r9
; AVX2-NEXT: subq $-128, %r9
-; AVX2-NEXT: orq %rbx, %r11
-; AVX2-NEXT: cmovneq %r13, %r9
-; AVX2-NEXT: addq $256, %r9 # imm = 0x100
-; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: orq %r11, %r10
+; AVX2-NEXT: cmovneq %rbx, %r9
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq %rcx, %rdx
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %rax, %r10
+; AVX2-NEXT: addq $64, %r10
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovneq %rdx, %r10
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq (%rsi), %rdx
+; AVX2-NEXT: addq $64, %rdx
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovneq %r11, %rdx
+; AVX2-NEXT: subq $-128, %rdx
; AVX2-NEXT: orq %rcx, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: cmovneq %r15, %r9
-; AVX2-NEXT: vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX2-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT: movl %r9d, %ecx
+; AVX2-NEXT: cmovneq %r10, %rdx
+; AVX2-NEXT: addq $256, %rdx # imm = 0x100
+; AVX2-NEXT: vpor 48(%rsi), %xmm1, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovneq %r9, %rdx
+; AVX2-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX2-NEXT: vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT: movl %edx, %ecx
; AVX2-NEXT: andl $63, %ecx
-; AVX2-NEXT: shrl $3, %r9d
-; AVX2-NEXT: andl $56, %r9d
-; AVX2-NEXT: movq -72(%rsp,%r9), %r14
-; AVX2-NEXT: movq -80(%rsp,%r9), %rax
-; AVX2-NEXT: movq %rax, %r8
-; AVX2-NEXT: shrdq %cl, %r14, %r8
-; AVX2-NEXT: movq -88(%rsp,%r9), %rbx
-; AVX2-NEXT: movq %rbx, %r10
+; AVX2-NEXT: shrl $3, %edx
+; AVX2-NEXT: andl $56, %edx
+; AVX2-NEXT: movq -72(%rsp,%rdx), %r11
+; AVX2-NEXT: movq -80(%rsp,%rdx), %rax
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: shrdq %cl, %r11, %rsi
+; AVX2-NEXT: movq -88(%rsp,%rdx), %r10
+; AVX2-NEXT: movq %r10, %r8
+; AVX2-NEXT: shrdq %cl, %rax, %r8
+; AVX2-NEXT: movq -96(%rsp,%rdx), %rax
+; AVX2-NEXT: movq %rax, %r9
+; AVX2-NEXT: shrdq %cl, %r10, %r9
+; AVX2-NEXT: movq -104(%rsp,%rdx), %r14
+; AVX2-NEXT: movq %r14, %r10
; AVX2-NEXT: shrdq %cl, %rax, %r10
-; AVX2-NEXT: movq -96(%rsp,%r9), %rax
-; AVX2-NEXT: movq %rax, %r11
-; AVX2-NEXT: shrdq %cl, %rbx, %r11
-; AVX2-NEXT: movq -104(%rsp,%r9), %r12
-; AVX2-NEXT: movq %r12, %rbx
-; AVX2-NEXT: shrdq %cl, %rax, %rbx
-; AVX2-NEXT: movq -112(%rsp,%r9), %rax
-; AVX2-NEXT: movq %rax, %r15
-; AVX2-NEXT: shrdq %cl, %r12, %r15
-; AVX2-NEXT: movq -128(%rsp,%r9), %r12
-; AVX2-NEXT: movq -120(%rsp,%r9), %r13
-; AVX2-NEXT: movq %r13, %r9
-; AVX2-NEXT: shrdq %cl, %rax, %r9
-; AVX2-NEXT: shrdq %cl, %r13, %r12
+; AVX2-NEXT: movq -112(%rsp,%rdx), %r15
+; AVX2-NEXT: movq %r15, %rbx
+; AVX2-NEXT: shrdq %cl, %r14, %rbx
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: orq %rdx, %rsi
-; AVX2-NEXT: shrxq %rcx, %r14, %rcx
-; AVX2-NEXT: cmoveq %rdi, %r9
-; AVX2-NEXT: cmoveq %rdi, %r15
-; AVX2-NEXT: cmoveq %rdi, %rbx
-; AVX2-NEXT: cmoveq %rdi, %r11
-; AVX2-NEXT: cmoveq %rdi, %r10
-; AVX2-NEXT: cmoveq %rdi, %r8
-; AVX2-NEXT: cmoveq %rdi, %r12
-; AVX2-NEXT: cmoveq %rdi, %rcx
+; AVX2-NEXT: movq -128(%rsp,%rdx), %rdi
+; AVX2-NEXT: movq -120(%rsp,%rdx), %r14
+; AVX2-NEXT: movq %r14, %rdx
+; AVX2-NEXT: shrdq %cl, %r15, %rdx
+; AVX2-NEXT: shrdq %cl, %r14, %rdi
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: shrxq %rcx, %r11, %rcx
+; AVX2-NEXT: cmoveq %r14, %rdx
+; AVX2-NEXT: cmoveq %r14, %rbx
+; AVX2-NEXT: cmoveq %r14, %r10
+; AVX2-NEXT: cmoveq %r14, %r9
+; AVX2-NEXT: cmoveq %r14, %r8
+; AVX2-NEXT: cmoveq %r14, %rsi
+; AVX2-NEXT: cmoveq %r14, %rdi
+; AVX2-NEXT: cmoveq %r14, %rcx
; AVX2-NEXT: movq %rcx, 56(%rax)
-; AVX2-NEXT: movq %r8, 48(%rax)
-; AVX2-NEXT: movq %r10, 40(%rax)
-; AVX2-NEXT: movq %r11, 32(%rax)
-; AVX2-NEXT: movq %rbx, 24(%rax)
-; AVX2-NEXT: movq %r15, 16(%rax)
-; AVX2-NEXT: movq %r9, 8(%rax)
-; AVX2-NEXT: movq %r12, (%rax)
+; AVX2-NEXT: movq %rsi, 48(%rax)
+; AVX2-NEXT: movq %r8, 40(%rax)
+; AVX2-NEXT: movq %r9, 32(%rax)
+; AVX2-NEXT: movq %r10, 24(%rax)
+; AVX2-NEXT: movq %rbx, 16(%rax)
+; AVX2-NEXT: movq %rdx, 8(%rax)
+; AVX2-NEXT: movq %rdi, (%rax)
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
; AVX2-NEXT: vzeroupper
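
For isolate_msb the interesting part of the diff is not the bsr/lzcnt cascade, which stays scalar, but the zero tests that steer its cmovs: the scalar OR-reduction trees over the loaded words are replaced by por/vpor feeding (v)ptest. Each widening step of the cascade follows the usual select pattern; a sketch with __builtin_clzll standing in for lzcntq (lzcntq is defined as 64 at zero, so the real code uses the or $64/cmovne form rather than branches):

    #include <stdint.h>

    /* clz of a 128-bit value from two 64-bit halves; the i512 cascade in
     * the test repeats this select at widths 128, 256 and 512, with the
     * "is the high half nonzero?" test now done by por + ptest. */
    static unsigned clz128(uint64_t hi, uint64_t lo) {
      if (hi) return (unsigned)__builtin_clzll(hi);
      if (lo) return 64u + (unsigned)__builtin_clzll(lo);
      return 128u; /* both halves zero */
    }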
@@ -5419,4 +5399,5 @@ define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
}
;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index a2395229eaace..4695f12707827 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -899,8 +899,6 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
;
; AVX512POPCNT-LABEL: test_ctpop_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm0
-; AVX512POPCNT-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm1
; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %rax
; AVX512POPCNT-NEXT: popcntq {{[0-9]+}}(%rsp), %r10
; AVX512POPCNT-NEXT: addl %eax, %r10d
@@ -916,16 +914,11 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
; AVX512POPCNT-NEXT: addl %eax, %edx
; AVX512POPCNT-NEXT: addl %ecx, %edx
; AVX512POPCNT-NEXT: addl %r8d, %edx
-; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vpmovqb %zmm1, %xmm1
-; AVX512POPCNT-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512POPCNT-NEXT: vpsadbw %xmm2, %xmm1, %xmm1
-; AVX512POPCNT-NEXT: vmovd %xmm1, %ecx
-; AVX512POPCNT-NEXT: vpopcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT: vpopcntq {{[0-9]+}}(%rsp), %zmm0
; AVX512POPCNT-NEXT: vpmovqb %zmm0, %xmm0
-; AVX512POPCNT-NEXT: vpsadbw %xmm2, %xmm0, %xmm0
+; AVX512POPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1
+; AVX512POPCNT-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT: addl %ecx, %eax
; AVX512POPCNT-NEXT: addl %edx, %eax
; AVX512POPCNT-NEXT: retq
;
@@ -1002,17 +995,12 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
; AVX512VLPOPCNT-NEXT: popcntq %rdi, %rdx
; AVX512VLPOPCNT-NEXT: addl %eax, %edx
; AVX512VLPOPCNT-NEXT: addl %ecx, %edx
-; AVX512VLPOPCNT-NEXT: vpopcntq {{[0-9]+}}(%rsp), %ymm0
-; AVX512VLPOPCNT-NEXT: vpmovqb %ymm0, %xmm0
; AVX512VLPOPCNT-NEXT: addl %r8d, %edx
+; AVX512VLPOPCNT-NEXT: vpopcntq {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT: vpmovqb %zmm0, %xmm0
; AVX512VLPOPCNT-NEXT: vpxor %xmm1, %xmm1, %xmm1
; AVX512VLPOPCNT-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %ecx
-; AVX512VLPOPCNT-NEXT: vpopcntq {{[0-9]+}}(%rsp), %ymm0
-; AVX512VLPOPCNT-NEXT: vpmovqb %ymm0, %xmm0
-; AVX512VLPOPCNT-NEXT: vpsadbw %xmm1, %xmm0, %xmm0
; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT: addl %ecx, %eax
; AVX512VLPOPCNT-NEXT: addl %edx, %eax
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
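
The ctpop changes are a straightforward widening: the two 256-bit vpopcntq halves (and one of the two vpmovqb/vpsadbw reduction tails) collapse into a single 512-bit vpopcntq. The tail that remains is just a horizontal byte-sum of the per-qword counts; as a scalar model:

    #include <stdint.h>

    /* What the vpopcntq + vpmovqb + vpsadbw tail computes for one 512-bit
     * half: per-qword popcounts, narrowed to bytes and summed. */
    static unsigned popcnt512(const uint64_t x[8]) {
      unsigned sum = 0;
      for (int i = 0; i < 8; ++i)
        sum += (unsigned)__builtin_popcountll(x[i]);
      return sum;
    }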
@@ -1883,117 +1871,100 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
define i32 @load_ctlz_i512(ptr %p0) nounwind {
; SSE-LABEL: load_ctlz_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 8(%rdi), %r10
-; SSE-NEXT: movq 16(%rdi), %r9
-; SSE-NEXT: movq 32(%rdi), %rcx
-; SSE-NEXT: movq 40(%rdi), %rdx
-; SSE-NEXT: movq 48(%rdi), %rsi
-; SSE-NEXT: movq 56(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %rsi, %r14
-; SSE-NEXT: xorl $63, %r14d
-; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: movdqa 32(%rdi), %xmm0
+; SSE-NEXT: movq 16(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq 40(%rdi), %r8
+; SSE-NEXT: movq 48(%rdi), %rax
+; SSE-NEXT: movq 56(%rdi), %r9
+; SSE-NEXT: bsrq %r9, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rax, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: orl $64, %r10d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %esi, %r10d
+; SSE-NEXT: bsrq %r8, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: bsrq 32(%rdi), %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: cmovnel %r10d, %esi
; SSE-NEXT: bsrq %rdx, %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %rcx, %r11
-; SSE-NEXT: xorl $63, %r11d
-; SSE-NEXT: orl $64, %r11d
+; SSE-NEXT: bsrq %rcx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %r11d
-; SSE-NEXT: movq 24(%rdi), %rbx
-; SSE-NEXT: subl $-128, %r11d
-; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: cmovnel %r14d, %r11d
-; SSE-NEXT: bsrq %rbx, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r9, %r14
-; SSE-NEXT: xorl $63, %r14d
-; SSE-NEXT: orl $64, %r14d
-; SSE-NEXT: testq %rbx, %rbx
-; SSE-NEXT: cmovnel %eax, %r14d
-; SSE-NEXT: bsrq %r10, %r15
-; SSE-NEXT: xorl $63, %r15d
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
; SSE-NEXT: movl $127, %eax
; SSE-NEXT: bsrq (%rdi), %rax
; SSE-NEXT: xorl $63, %eax
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %rbx, %r9
-; SSE-NEXT: cmovnel %r14d, %eax
-; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: orq %rsi, %rcx
; SSE-NEXT: orq %rdx, %rcx
-; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por 48(%rdi), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %esi, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: load_ctlz_i512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 32(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdx
-; AVX2-NEXT: movq 48(%rdi), %rsi
-; AVX2-NEXT: movq 56(%rdi), %r8
-; AVX2-NEXT: lzcntq %r8, %rax
-; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: lzcntq %rsi, %rbx
-; AVX2-NEXT: addl $64, %ebx
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq 48(%rdi), %r8
+; AVX2-NEXT: movq 56(%rdi), %r9
+; AVX2-NEXT: lzcntq %r9, %rsi
+; AVX2-NEXT: lzcntq %r8, %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %esi, %r10d
+; AVX2-NEXT: lzcntq %rax, %r11
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq 32(%rdi), %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: lzcntq %rcx, %r11
-; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r11d
-; AVX2-NEXT: subl $-128, %r11d
-; AVX2-NEXT: movq %rsi, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: cmovnel %ebx, %r11d
-; AVX2-NEXT: movq 24(%rdi), %rbx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rbx, %rax
-; AVX2-NEXT: xorl %r14d, %r14d
-; AVX2-NEXT: lzcntq %r9, %r14
-; AVX2-NEXT: addl $64, %r14d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %eax, %r14d
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: lzcntq %r10, %r15
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq (%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rbx, %r9
-; AVX2-NEXT: cmovnel %r14d, %eax
-; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vpor 48(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_ctlz_i512:
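
In the i1024 ctlz hunk that follows, the AVX-512 prefixes keep the whole search vectorized: vplzcntq produces per-lane counts, vpaddq adds each lane's bit offset (the [0,64,...,448] constant), vptestmq masks off zero lanes, and vpcompressq moves the most-significant nonzero lane's result into element 0, with a broadcast 512 as the all-zero fallback. A scalar model of that idiom (lane 7 most significant; the name is illustrative):

    #include <stdint.h>

    /* Model of the vplzcntq/vpaddq/vptestmq/vpcompressq pattern. */
    static unsigned clz512(const uint64_t x[8]) {
      for (int i = 7; i >= 0; --i)
        if (x[i])
          return (unsigned)__builtin_clzll(x[i]) + 64u * (unsigned)(7 - i);
      return 512u; /* fallback broadcast into the compress destination */
    }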
@@ -2211,445 +2182,359 @@ define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind {
define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctlz_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %r9, %r11
-; SSE-NEXT: movq %r8, %r9
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %rdx, %r12
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r15, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: bsrq %r14, %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: bsrq %r11, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq %rax, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %r10d, %r14d
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq %r8, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: orl $64, %ebx
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r10d, %ebx
+; SSE-NEXT: subl $-128, %ebx
+; SSE-NEXT: movq %rax, %r10
+; SSE-NEXT: orq %r11, %r10
+; SSE-NEXT: cmovnel %r14d, %ebx
+; SSE-NEXT: bsrq %rcx, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq %rdx, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r10d, %r14d
+; SSE-NEXT: movl $127, %r10d
+; SSE-NEXT: bsrq %rdi, %r10
+; SSE-NEXT: bsrq %rsi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: addl $64, %r10d
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %edi, %r10d
+; SSE-NEXT: subl $-128, %r10d
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: cmovnel %r14d, %r10d
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; SSE-NEXT: orq %r11, %r9
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: orq %rax, %r8
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: addl $256, %r10d # imm = 0x100
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %ebx, %r10d
+; SSE-NEXT: addl $512, %r10d # imm = 0x200
+; SSE-NEXT: bsrq %rax, %rdx
; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %r15, %rdx
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq %r8, %r14
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: bsrq %r13, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: bsrq %rbx, %rdx
+; SSE-NEXT: bsrq %rsi, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: orl $64, %r9d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %edx, %r9d
+; SSE-NEXT: bsrq %rdi, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rdx
; SSE-NEXT: xorl $63, %edx
; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r13, %r13
-; SSE-NEXT: cmovnel %ecx, %edx
-; SSE-NEXT: bsrq %r10, %rcx
-; SSE-NEXT: xorl $63, %ecx
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r8d, %edx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdi
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: bsrq %r8, %rbp
-; SSE-NEXT: xorl $63, %ebp
-; SSE-NEXT: orl $64, %ebp
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %ecx, %ebp
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %rbx, %rcx
-; SSE-NEXT: orq %r13, %rcx
-; SSE-NEXT: cmovnel %edx, %ebp
-; SSE-NEXT: addl $256, %ebp # imm = 0x100
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: orq %r14, %rcx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: orq %r15, %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT: bsrq %r14, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT: bsrq %r15, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: bsrq %r11, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r9, %rdx
-; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %eax, %edx
; SSE-NEXT: subl $-128, %edx
-; SSE-NEXT: movq %r15, %rax
-; SSE-NEXT: orq %r14, %rax
-; SSE-NEXT: cmovnel %ecx, %edx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT: bsrq %r15, %rax
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: cmovnel %r9d, %edx
+; SSE-NEXT: bsrq %r8, %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r12, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r15, %r15
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: movl $127, %eax
-; SSE-NEXT: bsrq %rdi, %rax
-; SSE-NEXT: bsrq %rsi, %rdi
-; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: bsrq %rdi, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %esi
+; SSE-NEXT: bsrq %rcx, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %rsi, %rsi
-; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r9d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r15, %r12
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq %r14, %r11
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r11, %r9
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: orq %r13, %r10
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: orq %rbx, %r8
-; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovel %r10d, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: test_ctlz_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %r9, %r14
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r12, %rcx
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: lzcntq %r8, %r9
-; AVX2-NEXT: addl $64, %r9d
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ecx, %r9d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r10, %rsi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rax, %rcx
-; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r15, %rbx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r11, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r15, %r15
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: lzcntq %r10, %r13
+; AVX2-NEXT: addl $64, %r14d
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %esi, %ecx
-; AVX2-NEXT: subl $-128, %ecx
-; AVX2-NEXT: movq %r8, %rsi
-; AVX2-NEXT: orq %r12, %rsi
-; AVX2-NEXT: cmovnel %r9d, %ecx
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: lzcntq %rbx, %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r15, %rsi
-; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: cmovnel %r13d, %r14d
+; AVX2-NEXT: subl $-128, %r14d
+; AVX2-NEXT: orq %r15, %r11
+; AVX2-NEXT: cmovnel %r12d, %r14d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %rbx, %r10
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %rax, %r15
+; AVX2-NEXT: addl $64, %r15d
; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %edi, %esi
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: lzcntq %r13, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: lzcntq %r9, %rdi
-; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: cmovnel %edi, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r15, %rdi
-; AVX2-NEXT: orq %rbx, %rdi
-; AVX2-NEXT: cmovnel %esi, %ebp
-; AVX2-NEXT: addl $256, %ebp # imm = 0x100
-; AVX2-NEXT: movq %r10, %rdi
-; AVX2-NEXT: orq %r12, %rdi
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rdi, %rsi
-; AVX2-NEXT: cmovnel %ecx, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdi, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r12, %rcx
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r11, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r14, %rsi
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovnel %esi, %ecx
-; AVX2-NEXT: subl $-128, %ecx
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: orq %r12, %rsi
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: lzcntq %rdx, %rdx
-; AVX2-NEXT: addl $64, %edx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: cmovnel %r10d, %r15d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r11, %r12
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %r12d, %r10d
+; AVX2-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: subl $-128, %r10d
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: cmovnel %r15d, %r10d
+; AVX2-NEXT: addl $256, %r10d # imm = 0x100
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %r14d, %r10d
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %r10, %rax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %eax, %edx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: lzcntq %rbx, %rax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %r11, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r9, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %r8, %r14
+; AVX2-NEXT: addl $64, %r14d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: subl $-128, %r14d
+; AVX2-NEXT: movq %r11, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: cmovnel %r15d, %r14d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %rdx, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %rsi, %r12
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rdi, %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: lzcntq %rsi, %r8
; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: cmovnel %r12d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r10, %rdi
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: orq %r12, %r14
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r14, %r11
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r15d, %eax
; AVX2-NEXT: orq %rbx, %r9
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: orq %r15, %r13
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r14d, %eax
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r9, %r13
-; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_ctlz_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: vmovq %rdi, %xmm0
-; AVX512F-NEXT: vmovq %rsi, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512F-NEXT: vmovq %rdi, %xmm1
+; AVX512F-NEXT: vmovq %rsi, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vmovq %r8, %xmm1
-; AVX512F-NEXT: vmovq %r9, %xmm3
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rcx, %xmm3
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,3,0,1]
; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: vmovd %xmm0, %ecx
+; AVX512F-NEXT: vmovq %r8, %xmm2
+; AVX512F-NEXT: vmovq %r9, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: orq %r14, %r11
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: orq %rbx, %r10
-; AVX512F-NEXT: orq %r11, %r10
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
; AVX512F-NEXT: cmovel %ecx, %eax
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: test_ctlz_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: pushq %r14
-; AVX512POPCNT-NEXT: pushq %rbx
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
-; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
-; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
-; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm2
; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
-; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512POPCNT-NEXT: vmovq %r8, %xmm1
-; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
-; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm2
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm3
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm3 = mem[2,3,0,1]
; AVX512POPCNT-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm4
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %ecx
; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT: orq %r14, %r11
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT: orq %rbx, %r10
-; AVX512POPCNT-NEXT: orq %r11, %r10
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
+; AVX512POPCNT-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
; AVX512POPCNT-NEXT: cmovel %ecx, %eax
-; AVX512POPCNT-NEXT: popq %rbx
-; AVX512POPCNT-NEXT: popq %r14
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: test_ctlz_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT: vmovq %rdi, %xmm0
-; AVX512VL-NEXT: vmovq %rsi, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
+; AVX512VL-NEXT: vmovq %rsi, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rcx, %xmm3
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT: vmovd %xmm0, %ecx
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm3
+; AVX512VL-NEXT: vmovq %r9, %xmm4
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: orq %r14, %r11
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: orq %rbx, %r10
-; AVX512VL-NEXT: orq %r11, %r10
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
; AVX512VL-NEXT: cmovel %ecx, %eax
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: test_ctlz_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: pushq %r14
-; AVX512VLPOPCNT-NEXT: pushq %rbx
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm0
-; AVX512VLPOPCNT-NEXT: vmovq %rsi, %xmm1
-; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm1
-; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512VLPOPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm1
+; AVX512VLPOPCNT-NEXT: vmovq %rsi, %xmm2
; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VLPOPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VLPOPCNT-NEXT: vmovq %r8, %xmm2
-; AVX512VLPOPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm2
+; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm3
; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VLPOPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VLPOPCNT-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VLPOPCNT-NEXT: vmovq %r8, %xmm3
+; AVX512VLPOPCNT-NEXT: vmovq %r9, %xmm4
+; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VLPOPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %ecx
; AVX512VLPOPCNT-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT: orq %r14, %r11
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT: orq %rbx, %r10
-; AVX512VLPOPCNT-NEXT: orq %r11, %r10
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
+; AVX512VLPOPCNT-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
; AVX512VLPOPCNT-NEXT: cmovel %ecx, %eax
-; AVX512VLPOPCNT-NEXT: popq %rbx
-; AVX512VLPOPCNT-NEXT: popq %r14
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0)
@@ -2660,390 +2545,305 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
define i32 @load_ctlz_i1024(ptr %p0) nounwind {
; SSE-LABEL: load_ctlz_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 40(%rdi), %rbp
-; SSE-NEXT: movq 64(%rdi), %rbx
-; SSE-NEXT: movq 72(%rdi), %r11
-; SSE-NEXT: movq 80(%rdi), %r12
-; SSE-NEXT: movq 88(%rdi), %r14
-; SSE-NEXT: movq 96(%rdi), %rsi
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq 72(%rdi), %rax
; SSE-NEXT: movq 104(%rdi), %r9
-; SSE-NEXT: movq 112(%rdi), %r10
+; SSE-NEXT: movq 112(%rdi), %rdx
; SSE-NEXT: movq 120(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r10, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
+; SSE-NEXT: bsrq %r8, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rdx, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: bsrq %r9, %rdx
-; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: bsrq %rsi, %rax
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: cmovnel %esi, %r11d
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq 96(%rdi), %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %r10, %rdx
-; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT: cmovnel %r10d, %esi
+; SSE-NEXT: movq 80(%rdi), %r9
+; SSE-NEXT: movq 88(%rdi), %r10
+; SSE-NEXT: subl $-128, %esi
; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: bsrq %r14, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: bsrq %r12, %rdx
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: bsrq %r10, %rdx
; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %ecx, %edx
-; SSE-NEXT: bsrq %r11, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: bsrq %rbx, %r15
-; SSE-NEXT: xorl $63, %r15d
-; SSE-NEXT: orl $64, %r15d
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %ecx, %r15d
-; SSE-NEXT: subl $-128, %r15d
-; SSE-NEXT: movq %r12, %rcx
-; SSE-NEXT: orq %r14, %rcx
-; SSE-NEXT: cmovnel %edx, %r15d
-; SSE-NEXT: movq 48(%rdi), %r12
-; SSE-NEXT: addl $256, %r15d # imm = 0x100
-; SSE-NEXT: movq %r9, %rcx
-; SSE-NEXT: orq %r8, %rcx
-; SSE-NEXT: movq %rsi, %rdx
-; SSE-NEXT: orq %r10, %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: movq 56(%rdi), %r13
-; SSE-NEXT: cmovnel %eax, %r15d
-; SSE-NEXT: bsrq %r13, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r12, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r13, %r13
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: movq %rbp, %r10
-; SSE-NEXT: bsrq %rbp, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq 32(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rbp
-; SSE-NEXT: xorl $63, %ebp
-; SSE-NEXT: orl $64, %ebp
+; SSE-NEXT: bsrq %r9, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %r12, %rax
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: cmovnel %ecx, %ebp
-; SSE-NEXT: movq 24(%rdi), %r9
-; SSE-NEXT: bsrq %r9, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq 16(%rdi), %rsi
-; SSE-NEXT: bsrq %rsi, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: movl $127, %eax
-; SSE-NEXT: bsrq (%rdi), %rax
-; SSE-NEXT: movq 8(%rdi), %rdi
-; SSE-NEXT: bsrq %rdi, %rdx
+; SSE-NEXT: cmovnel %edx, %r11d
+; SSE-NEXT: bsrq %rax, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: bsrq 64(%rdi), %rdx
; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %rdi, %rdi
-; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: movq 40(%rdi), %r8
+; SSE-NEXT: cmovnel %ebx, %edx
+; SSE-NEXT: movq 48(%rdi), %rax
+; SSE-NEXT: movdqa 112(%rdi), %xmm0
+; SSE-NEXT: movdqa 96(%rdi), %xmm1
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: cmovnel %r11d, %edx
+; SSE-NEXT: addl $256, %edx # imm = 0x100
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: movq 56(%rdi), %r10
+; SSE-NEXT: cmovnel %esi, %edx
+; SSE-NEXT: bsrq %r10, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rax, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %esi, %r11d
+; SSE-NEXT: bsrq %r8, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq 32(%rdi), %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq 16(%rdi), %r8
+; SSE-NEXT: movq 24(%rdi), %r9
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r10, %rax
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: bsrq %r9, %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: bsrq %r8, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: orl $64, %r10d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %r10d
+; SSE-NEXT: bsrq %rcx, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: movl $127, %eax
+; SSE-NEXT: bsrq (%rdi), %rax
+; SSE-NEXT: xorl $63, %eax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r11d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %rsi
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq %r13, %r10
-; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %r10d, %eax
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: cmovnel %ebp, %eax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: orq %r14, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: por 48(%rdi), %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: por 80(%rdi), %xmm0
+; SSE-NEXT: por 64(%rdi), %xmm1
; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r11, %rbx
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: load_ctlz_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 48(%rdi), %r9
-; AVX2-NEXT: movq 56(%rdi), %rbp
-; AVX2-NEXT: movq 64(%rdi), %r11
-; AVX2-NEXT: movq 72(%rdi), %r10
-; AVX2-NEXT: movq 80(%rdi), %r14
-; AVX2-NEXT: movq 88(%rdi), %rbx
-; AVX2-NEXT: movq 96(%rdi), %rdx
-; AVX2-NEXT: movq 104(%rdi), %r8
-; AVX2-NEXT: movq 112(%rdi), %rsi
-; AVX2-NEXT: movq 120(%rdi), %r15
-; AVX2-NEXT: lzcntq %r15, %rax
-; AVX2-NEXT: lzcntq %rsi, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: testq %r15, %r15
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %r8, %r12
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq 72(%rdi), %rsi
+; AVX2-NEXT: movq 104(%rdi), %rdx
+; AVX2-NEXT: movq 112(%rdi), %r8
+; AVX2-NEXT: movq 120(%rdi), %r10
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r8, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: lzcntq %rdx, %r11
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: lzcntq 96(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: movq 80(%rdi), %r9
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: movq 88(%rdi), %r11
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %rsi, %r12
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: orq %r15, %r12
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rbx, %rcx
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: lzcntq %r14, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %ecx, %r13d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r10, %rcx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %r11, %r12
-; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %ecx, %r12d
-; AVX2-NEXT: subl $-128, %r12d
-; AVX2-NEXT: movq %r14, %rcx
-; AVX2-NEXT: orq %rbx, %rcx
-; AVX2-NEXT: cmovnel %r13d, %r12d
-; AVX2-NEXT: addl $256, %r12d # imm = 0x100
-; AVX2-NEXT: movq %r8, %rcx
-; AVX2-NEXT: orq %r15, %rcx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r12d
-; AVX2-NEXT: movq %rbp, %r14
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rbp, %rcx
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq %r11, %rdx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r9, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %edx, %ebx
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq 64(%rdi), %rdx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rsi, %r8
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: movq 40(%rdi), %r10
+; AVX2-NEXT: cmovnel %r8d, %edx
+; AVX2-NEXT: movq 48(%rdi), %r8
+; AVX2-NEXT: subl $-128, %edx
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq 56(%rdi), %r9
+; AVX2-NEXT: cmovnel %ebx, %edx
+; AVX2-NEXT: vmovdqu 96(%rdi), %ymm0
+; AVX2-NEXT: addl $256, %edx # imm = 0x100
+; AVX2-NEXT: vpor 112(%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %eax, %edx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq %r9, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %rbp, %rbp
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: movq 32(%rdi), %r13
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: lzcntq %r13, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: movq 40(%rdi), %r8
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: lzcntq %r8, %rdx
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %edx, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r9, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: cmovnel %eax, %ebp
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r9, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq 32(%rdi), %rsi
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq 8(%rdi), %rsi
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: movq 24(%rdi), %r10
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: lzcntq %r9, %r11
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq (%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: lzcntq %rsi, %rdi
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r11d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rdx, %r9
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq %r14, %r8
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r10, %rcx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %r13
-; AVX2-NEXT: cmovnel %ebp, %eax
-; AVX2-NEXT: orq %r15, %rbx
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbx, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: vpor 48(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vpor 64(%rdi), %ymm0, %ymm0
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r10, %r11
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %edx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_ctlz_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq 80(%rdi), %rsi
-; AVX512F-NEXT: movq 64(%rdi), %rcx
-; AVX512F-NEXT: movq 72(%rdi), %rdx
-; AVX512F-NEXT: movq 88(%rdi), %r8
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm2, %ecx
+; AVX512F-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: orq 120(%rdi), %r8
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: addl $512, %eax # imm = 0x200
-; AVX512F-NEXT: orq 104(%rdi), %rdx
-; AVX512F-NEXT: orq %r8, %rdx
-; AVX512F-NEXT: orq 112(%rdi), %rsi
-; AVX512F-NEXT: orq 96(%rdi), %rcx
-; AVX512F-NEXT: orq %rsi, %rcx
-; AVX512F-NEXT: orq %rdx, %rcx
-; AVX512F-NEXT: cmovnel %r9d, %eax
+; AVX512F-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
+; AVX512F-NEXT: cmovnel %ecx, %eax
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: load_ctlz_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi
-; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx
-; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx
-; AVX512POPCNT-NEXT: movq 88(%rdi), %r8
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm2, %ecx
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d
-; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT: orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx
-; AVX512POPCNT-NEXT: orq %r8, %rdx
-; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx
-; AVX512POPCNT-NEXT: orq %rsi, %rcx
-; AVX512POPCNT-NEXT: orq %rdx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT: cmovnel %ecx, %eax
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: load_ctlz_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq 80(%rdi), %rsi
-; AVX512VL-NEXT: movq 64(%rdi), %rcx
-; AVX512VL-NEXT: movq 72(%rdi), %rdx
-; AVX512VL-NEXT: movq 88(%rdi), %r8
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm2, %ecx
+; AVX512VL-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: orq 120(%rdi), %r8
-; AVX512VL-NEXT: orq 104(%rdi), %rdx
-; AVX512VL-NEXT: orq 112(%rdi), %rsi
-; AVX512VL-NEXT: orq %r8, %rdx
-; AVX512VL-NEXT: orq 96(%rdi), %rcx
-; AVX512VL-NEXT: orq %rsi, %rcx
-; AVX512VL-NEXT: orq %rdx, %rcx
-; AVX512VL-NEXT: cmovnel %r9d, %eax
+; AVX512VL-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
+; AVX512VL-NEXT: cmovnel %ecx, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: load_ctlz_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: movq 80(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: movq 64(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: movq 72(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: movq 88(%rdi), %r8
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm2, %ecx
+; AVX512VLPOPCNT-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %r9d
-; AVX512VLPOPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
; AVX512VLPOPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT: orq 120(%rdi), %r8
-; AVX512VLPOPCNT-NEXT: orq 104(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: orq 112(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq %r8, %rdx
-; AVX512VLPOPCNT-NEXT: orq 96(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: orq %rsi, %rcx
-; AVX512VLPOPCNT-NEXT: orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512VLPOPCNT-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: cmovnel %ecx, %eax
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
@@ -3683,116 +3483,101 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
define i32 @load_ctlz_undef_i512(ptr %p0) nounwind {
; SSE-LABEL: load_ctlz_undef_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 8(%rdi), %r11
-; SSE-NEXT: movq 16(%rdi), %r9
-; SSE-NEXT: movq 24(%rdi), %r10
-; SSE-NEXT: movq 32(%rdi), %rcx
-; SSE-NEXT: movq 40(%rdi), %rdx
-; SSE-NEXT: movq 48(%rdi), %rsi
-; SSE-NEXT: movq 56(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %rsi, %r14
-; SSE-NEXT: xorl $63, %r14d
-; SSE-NEXT: orl $64, %r14d
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %r14d
+; SSE-NEXT: movdqa 32(%rdi), %xmm0
+; SSE-NEXT: movq 8(%rdi), %rsi
+; SSE-NEXT: movq 16(%rdi), %rcx
+; SSE-NEXT: movq 24(%rdi), %rdx
+; SSE-NEXT: movq 48(%rdi), %rax
+; SSE-NEXT: movq 56(%rdi), %r9
+; SSE-NEXT: bsrq %r9, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq %rax, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: orl $64, %r10d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r8d, %r10d
+; SSE-NEXT: movq 40(%rdi), %r11
+; SSE-NEXT: bsrq %r11, %rbx
+; SSE-NEXT: bsrq 32(%rdi), %r8
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %ebx, %r8d
+; SSE-NEXT: subl $-128, %r8d
+; SSE-NEXT: orq %r9, %rax
+; SSE-NEXT: cmovnel %r10d, %r8d
; SSE-NEXT: bsrq %rdx, %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %rcx, %rbx
-; SSE-NEXT: xorl $63, %ebx
-; SSE-NEXT: orl $64, %ebx
+; SSE-NEXT: bsrq %rcx, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: orl $64, %r9d
; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %ebx
-; SSE-NEXT: subl $-128, %ebx
-; SSE-NEXT: movq %rsi, %rax
-; SSE-NEXT: orq %r8, %rax
-; SSE-NEXT: cmovnel %r14d, %ebx
-; SSE-NEXT: bsrq %r10, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r9, %r14
-; SSE-NEXT: xorl $63, %r14d
-; SSE-NEXT: orl $64, %r14d
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %eax, %r14d
-; SSE-NEXT: bsrq %r11, %r15
-; SSE-NEXT: xorl $63, %r15d
+; SSE-NEXT: cmovnel %eax, %r9d
+; SSE-NEXT: bsrq %rsi, %r10
+; SSE-NEXT: xorl $63, %r10d
; SSE-NEXT: bsrq (%rdi), %rax
; SSE-NEXT: xorl $63, %eax
; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r10d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r10, %r9
-; SSE-NEXT: cmovnel %r14d, %eax
-; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: orq %rsi, %rcx
; SSE-NEXT: orq %rdx, %rcx
-; SSE-NEXT: cmovnel %ebx, %eax
+; SSE-NEXT: cmovnel %r9d, %eax
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por 48(%rdi), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %r8d, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: load_ctlz_undef_i512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 8(%rdi), %r10
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: movq 32(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %rdx
-; AVX2-NEXT: movq 48(%rdi), %rsi
-; AVX2-NEXT: movq 56(%rdi), %r8
-; AVX2-NEXT: lzcntq %r8, %rax
-; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: lzcntq %rsi, %rbx
-; AVX2-NEXT: addl $64, %ebx
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: movq 40(%rdi), %rax
+; AVX2-NEXT: movq 48(%rdi), %r8
+; AVX2-NEXT: movq 56(%rdi), %r9
+; AVX2-NEXT: lzcntq %r9, %rsi
+; AVX2-NEXT: lzcntq %r8, %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %esi, %r10d
+; AVX2-NEXT: lzcntq %rax, %r11
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq 32(%rdi), %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: lzcntq %rcx, %r11
-; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r11d
-; AVX2-NEXT: subl $-128, %r11d
-; AVX2-NEXT: movq %rsi, %rax
-; AVX2-NEXT: orq %r8, %rax
-; AVX2-NEXT: cmovnel %ebx, %r11d
-; AVX2-NEXT: movq 24(%rdi), %rbx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rbx, %rax
-; AVX2-NEXT: xorl %r14d, %r14d
-; AVX2-NEXT: lzcntq %r9, %r14
-; AVX2-NEXT: addl $64, %r14d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %eax, %r14d
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: lzcntq %r10, %r15
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %r9, %r10
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq (%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rbx, %r9
-; AVX2-NEXT: cmovnel %r14d, %eax
-; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %rdx
-; AVX2-NEXT: orq %rsi, %rcx
; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm0
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: vpor 48(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_ctlz_undef_i512:
@@ -4001,443 +3786,354 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind {
define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
; SSE-LABEL: test_ctlz_undef_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %r9, %r12
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: bsrq %r10, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: bsrq %rax, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: orl $64, %ebx
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %r11d, %ebx
+; SSE-NEXT: bsrq %r9, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: bsrq %r8, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r14d, %r11d
+; SSE-NEXT: subl $-128, %r11d
+; SSE-NEXT: movq %rax, %r14
+; SSE-NEXT: orq %r10, %r14
+; SSE-NEXT: cmovnel %ebx, %r11d
+; SSE-NEXT: bsrq %rcx, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: bsrq %rdx, %r14
+; SSE-NEXT: xorl $63, %r14d
+; SSE-NEXT: orl $64, %r14d
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %ebx, %r14d
+; SSE-NEXT: bsrq %rsi, %rbx
+; SSE-NEXT: xorl $63, %ebx
+; SSE-NEXT: bsrq %rdi, %rdi
+; SSE-NEXT: xorl $63, %edi
+; SSE-NEXT: orl $64, %edi
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %ebx, %edi
+; SSE-NEXT: subl $-128, %edi
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: cmovnel %r14d, %edi
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT: orq %r10, %r9
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT: bsrq %r11, %rax
+; SSE-NEXT: orq %rax, %r8
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: addl $256, %edi # imm = 0x100
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %r11d, %edi
+; SSE-NEXT: addl $512, %edi # imm = 0x200
+; SSE-NEXT: bsrq %rax, %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: bsrq %rsi, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: orl $64, %r10d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %edx, %r10d
+; SSE-NEXT: bsrq %rbx, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: xorl $63, %edx
+; SSE-NEXT: orl $64, %edx
+; SSE-NEXT: testq %rbx, %rbx
+; SSE-NEXT: cmovnel %r8d, %edx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: orq %rax, %rsi
+; SSE-NEXT: cmovnel %r10d, %edx
+; SSE-NEXT: bsrq %r9, %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %rsi, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: bsrq %rdx, %r10
+; SSE-NEXT: bsrq %r8, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %esi
+; SSE-NEXT: bsrq %rcx, %r10
; SSE-NEXT: xorl $63, %r10d
; SSE-NEXT: bsrq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: xorl $63, %eax
; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: testq %rcx, %rcx
; SSE-NEXT: cmovnel %r10d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %rsi, %r9
-; SSE-NEXT: movq %rsi, %rbx
-; SSE-NEXT: orq %r11, %r9
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: bsrq %r15, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: bsrq %r13, %rsi
-; SSE-NEXT: xorl $63, %esi
-; SSE-NEXT: orl $64, %esi
-; SSE-NEXT: testq %r15, %r15
-; SSE-NEXT: cmovnel %ecx, %esi
-; SSE-NEXT: bsrq %r14, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; SSE-NEXT: bsrq %r9, %rbp
-; SSE-NEXT: xorl $63, %ebp
-; SSE-NEXT: orl $64, %ebp
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %ecx, %ebp
-; SSE-NEXT: movq %r8, %r10
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %r13, %rcx
-; SSE-NEXT: orq %r15, %rcx
-; SSE-NEXT: cmovnel %esi, %ebp
-; SSE-NEXT: addl $256, %ebp # imm = 0x100
-; SSE-NEXT: orq %r11, %rdx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE-NEXT: orq %rbx, %rsi
-; SSE-NEXT: orq %rdx, %rsi
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: bsrq %rdx, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: bsrq %r12, %rsi
-; SSE-NEXT: xorl $63, %esi
-; SSE-NEXT: bsrq %r10, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r12, %r12
-; SSE-NEXT: cmovnel %esi, %ecx
-; SSE-NEXT: movq %rdi, %rbx
-; SSE-NEXT: subl $-128, %ecx
-; SSE-NEXT: movq %r8, %rsi
-; SSE-NEXT: orq %rdx, %rsi
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT: bsrq %r11, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT: bsrq %r8, %rdx
-; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %eax, %edx
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE-NEXT: bsrq %rdi, %rsi
-; SSE-NEXT: xorl $63, %esi
-; SSE-NEXT: bsrq %rbx, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: orq %r9, %r8
; SSE-NEXT: cmovnel %esi, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm1
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r12, %r10
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT: orq %r15, %r14
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r9
-; SSE-NEXT: orq %r13, %r9
-; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r14, %r9
-; SSE-NEXT: cmovnel %ebp, %eax
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %edx, %eax
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovel %edi, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: test_ctlz_undef_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
; AVX2-NEXT: pushq %r15
; AVX2-NEXT: pushq %r14
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %r9, %r14
-; AVX2-NEXT: movq %r8, %r11
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r12, %rcx
-; AVX2-NEXT: xorl %r9d, %r9d
-; AVX2-NEXT: lzcntq %r8, %r9
-; AVX2-NEXT: addl $64, %r9d
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ecx, %r9d
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r10, %rsi
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rax, %rcx
-; AVX2-NEXT: addl $64, %ecx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r15, %rbx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r11, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r15, %r15
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: lzcntq %r10, %r13
+; AVX2-NEXT: addl $64, %r14d
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %esi, %ecx
-; AVX2-NEXT: subl $-128, %ecx
-; AVX2-NEXT: movq %r8, %rsi
-; AVX2-NEXT: orq %r12, %rsi
-; AVX2-NEXT: cmovnel %r9d, %ecx
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: lzcntq %rbx, %rdi
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r15, %rsi
-; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: cmovnel %r13d, %r14d
+; AVX2-NEXT: subl $-128, %r14d
+; AVX2-NEXT: orq %r15, %r11
+; AVX2-NEXT: cmovnel %r12d, %r14d
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq %rbx, %r10
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %rax, %r15
+; AVX2-NEXT: addl $64, %r15d
; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %edi, %esi
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: lzcntq %r13, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; AVX2-NEXT: xorl %edi, %edi
-; AVX2-NEXT: lzcntq %r9, %rdi
+; AVX2-NEXT: cmovnel %r10d, %r15d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %r11, %r12
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: lzcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %r12d, %r10d
+; AVX2-NEXT: vmovdqu {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT: subl $-128, %r10d
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT: cmovnel %r15d, %r10d
+; AVX2-NEXT: addl $256, %r10d # imm = 0x100
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %r14d, %r10d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %rbx, %rax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %r11, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rbx, %rbx
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r9, %rax
+; AVX2-NEXT: xorl %r14d, %r14d
+; AVX2-NEXT: lzcntq %r8, %r14
+; AVX2-NEXT: addl $64, %r14d
; AVX2-NEXT: testq %r9, %r9
-; AVX2-NEXT: cmovnel %edi, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r15, %rdi
-; AVX2-NEXT: orq %rbx, %rdi
-; AVX2-NEXT: cmovnel %esi, %ebp
-; AVX2-NEXT: addl $256, %ebp # imm = 0x100
-; AVX2-NEXT: movq %r10, %rdi
-; AVX2-NEXT: orq %r12, %rdi
-; AVX2-NEXT: movq %rax, %rsi
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rdi, %rsi
-; AVX2-NEXT: cmovnel %ecx, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT: cmovnel %eax, %r14d
+; AVX2-NEXT: subl $-128, %r14d
+; AVX2-NEXT: movq %r11, %rax
+; AVX2-NEXT: orq %rbx, %rax
+; AVX2-NEXT: cmovnel %r15d, %r14d
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdi, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r12, %rcx
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r11, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: xorl %esi, %esi
-; AVX2-NEXT: lzcntq %r14, %rsi
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovnel %esi, %ecx
-; AVX2-NEXT: subl $-128, %ecx
-; AVX2-NEXT: movq %rdi, %rsi
-; AVX2-NEXT: orq %r12, %rsi
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq %rdx, %rdi
-; AVX2-NEXT: lzcntq %rdx, %rdx
-; AVX2-NEXT: addl $64, %edx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT: lzcntq %rcx, %rax
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: lzcntq %rdx, %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %rcx, %rcx
+; AVX2-NEXT: cmovnel %eax, %r15d
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: lzcntq %rsi, %r12
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %r10, %rax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %eax, %edx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT: lzcntq %rax, %rax
+; AVX2-NEXT: lzcntq %rdi, %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT: lzcntq %rsi, %r8
; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: cmovnel %r12d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r10, %rdi
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: orq %r12, %r14
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r14, %r11
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r15d, %eax
; AVX2-NEXT: orq %rbx, %r9
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: orq %r15, %r13
+; AVX2-NEXT: orq %r11, %r8
+; AVX2-NEXT: addl $256, %eax # imm = 0x100
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r14d, %eax
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r9, %r13
-; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
; AVX2-NEXT: popq %r14
; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: test_ctlz_undef_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: vmovq %rdi, %xmm0
-; AVX512F-NEXT: vmovq %rsi, %xmm1
-; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT: vmovq %rdx, %xmm1
-; AVX512F-NEXT: vmovq %rcx, %xmm2
+; AVX512F-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512F-NEXT: vmovq %rdi, %xmm1
+; AVX512F-NEXT: vmovq %rsi, %xmm2
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512F-NEXT: vmovq %r8, %xmm2
-; AVX512F-NEXT: vmovq %r9, %xmm3
+; AVX512F-NEXT: vmovq %rdx, %xmm2
+; AVX512F-NEXT: vmovq %rcx, %xmm3
; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm0, %ecx
+; AVX512F-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512F-NEXT: vmovq %r8, %xmm3
+; AVX512F-NEXT: vmovq %r9, %xmm4
+; AVX512F-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512F-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %ecx
; AVX512F-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT: orq %r14, %r11
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT: orq %rbx, %r10
-; AVX512F-NEXT: orq %r11, %r10
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %eax
+; AVX512F-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
; AVX512F-NEXT: cmovel %ecx, %eax
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r14
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: test_ctlz_undef_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: pushq %r14
-; AVX512POPCNT-NEXT: pushq %rbx
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT: vmovq %rdi, %xmm0
-; AVX512POPCNT-NEXT: vmovq %rsi, %xmm1
-; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512POPCNT-NEXT: vmovq %rdx, %xmm1
-; AVX512POPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512POPCNT-NEXT: vmovq %rdi, %xmm1
+; AVX512POPCNT-NEXT: vmovq %rsi, %xmm2
; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512POPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512POPCNT-NEXT: vmovq %r8, %xmm2
-; AVX512POPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT: vmovq %rdx, %xmm2
+; AVX512POPCNT-NEXT: vmovq %rcx, %xmm3
; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512POPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512POPCNT-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT: vmovq %r8, %xmm3
+; AVX512POPCNT-NEXT: vmovq %r9, %xmm4
+; AVX512POPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512POPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512POPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %ecx
; AVX512POPCNT-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT: orq %r14, %r11
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT: orq %rbx, %r10
-; AVX512POPCNT-NEXT: orq %r11, %r10
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
+; AVX512POPCNT-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
; AVX512POPCNT-NEXT: cmovel %ecx, %eax
-; AVX512POPCNT-NEXT: popq %rbx
-; AVX512POPCNT-NEXT: popq %r14
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: test_ctlz_undef_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT: vmovq %rdi, %xmm0
-; AVX512VL-NEXT: vmovq %rsi, %xmm1
-; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT: vmovq %rdx, %xmm1
-; AVX512VL-NEXT: vmovq %rcx, %xmm2
+; AVX512VL-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT: vmovq %rdi, %xmm1
+; AVX512VL-NEXT: vmovq %rsi, %xmm2
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VL-NEXT: vmovq %r8, %xmm2
-; AVX512VL-NEXT: vmovq %r9, %xmm3
+; AVX512VL-NEXT: vmovq %rdx, %xmm2
+; AVX512VL-NEXT: vmovq %rcx, %xmm3
; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm0, %ecx
+; AVX512VL-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VL-NEXT: vmovq %r8, %xmm3
+; AVX512VL-NEXT: vmovq %r9, %xmm4
+; AVX512VL-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VL-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %ecx
; AVX512VL-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT: orq %r14, %r11
-; AVX512VL-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT: orq %rbx, %r10
-; AVX512VL-NEXT: orq %r11, %r10
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
+; AVX512VL-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
; AVX512VL-NEXT: cmovel %ecx, %eax
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r14
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: test_ctlz_undef_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: pushq %r14
-; AVX512VLPOPCNT-NEXT: pushq %rbx
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm0
-; AVX512VLPOPCNT-NEXT: vmovq %rsi, %xmm1
-; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm1
-; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm2
+; AVX512VLPOPCNT-NEXT: vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovq %rdi, %xmm1
+; AVX512VLPOPCNT-NEXT: vmovq %rsi, %xmm2
; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VLPOPCNT-NEXT: vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VLPOPCNT-NEXT: vmovq %r8, %xmm2
-; AVX512VLPOPCNT-NEXT: vmovq %r9, %xmm3
+; AVX512VLPOPCNT-NEXT: vmovq %rdx, %xmm2
+; AVX512VLPOPCNT-NEXT: vmovq %rcx, %xmm3
; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VLPOPCNT-NEXT: vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %ecx
+; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VLPOPCNT-NEXT: vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VLPOPCNT-NEXT: vmovq %r8, %xmm3
+; AVX512VLPOPCNT-NEXT: vmovq %r9, %xmm4
+; AVX512VLPOPCNT-NEXT: vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VLPOPCNT-NEXT: vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VLPOPCNT-NEXT: vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %ecx
; AVX512VLPOPCNT-NEXT: addl $512, %ecx # imm = 0x200
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT: vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT: orq %r14, %r11
-; AVX512VLPOPCNT-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT: orq %rbx, %r10
-; AVX512VLPOPCNT-NEXT: orq %r11, %r10
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm1
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
+; AVX512VLPOPCNT-NEXT: vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
; AVX512VLPOPCNT-NEXT: cmovel %ecx, %eax
-; AVX512VLPOPCNT-NEXT: popq %rbx
-; AVX512VLPOPCNT-NEXT: popq %r14
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1)
@@ -4448,386 +4144,298 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind {
; SSE-LABEL: load_ctlz_undef_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 40(%rdi), %rbp
-; SSE-NEXT: movq 64(%rdi), %rbx
-; SSE-NEXT: movq 72(%rdi), %r11
-; SSE-NEXT: movq 80(%rdi), %r12
-; SSE-NEXT: movq 88(%rdi), %r14
-; SSE-NEXT: movq 96(%rdi), %r13
-; SSE-NEXT: movq 104(%rdi), %r9
-; SSE-NEXT: movq 112(%rdi), %r10
-; SSE-NEXT: movq 120(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r10, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: bsrq %r9, %rdx
-; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: bsrq %r13, %rax
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: orl $64, %eax
+; SSE-NEXT: movq 8(%rdi), %rcx
+; SSE-NEXT: movq 72(%rdi), %rax
+; SSE-NEXT: movq 104(%rdi), %r8
+; SSE-NEXT: movq 112(%rdi), %rdx
+; SSE-NEXT: movq 120(%rdi), %r9
+; SSE-NEXT: bsrq %r9, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rdx, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %r10, %rdx
-; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: orq %r8, %rdx
-; SSE-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: bsrq %r14, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: movq %r12, %rsi
-; SSE-NEXT: movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: bsrq %r12, %rdx
+; SSE-NEXT: cmovnel %esi, %r11d
+; SSE-NEXT: bsrq %r8, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq 96(%rdi), %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: movq 80(%rdi), %r8
+; SSE-NEXT: cmovnel %r10d, %esi
+; SSE-NEXT: movq 88(%rdi), %r10
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r9, %rdx
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: bsrq %r10, %rdx
; SSE-NEXT: xorl $63, %edx
-; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %ecx, %edx
-; SSE-NEXT: bsrq %r11, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: bsrq %rbx, %r15
-; SSE-NEXT: xorl $63, %r15d
-; SSE-NEXT: orl $64, %r15d
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %ecx, %r15d
-; SSE-NEXT: movq 48(%rdi), %r12
-; SSE-NEXT: subl $-128, %r15d
-; SSE-NEXT: movq %rsi, %rcx
-; SSE-NEXT: orq %r14, %rcx
-; SSE-NEXT: cmovnel %edx, %r15d
-; SSE-NEXT: addl $256, %r15d # imm = 0x100
-; SSE-NEXT: movq %r9, %rcx
-; SSE-NEXT: orq %r8, %rcx
-; SSE-NEXT: movq %r13, %rdx
-; SSE-NEXT: orq %r10, %rdx
-; SSE-NEXT: orq %rcx, %rdx
-; SSE-NEXT: movq 56(%rdi), %r13
-; SSE-NEXT: cmovnel %eax, %r15d
-; SSE-NEXT: bsrq %r13, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: bsrq %r12, %rdx
+; SSE-NEXT: bsrq %r8, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %edx, %r11d
+; SSE-NEXT: bsrq %rax, %r9
+; SSE-NEXT: xorl $63, %r9d
+; SSE-NEXT: bsrq 64(%rdi), %rdx
; SSE-NEXT: xorl $63, %edx
; SSE-NEXT: orl $64, %edx
-; SSE-NEXT: testq %r13, %r13
-; SSE-NEXT: cmovnel %eax, %edx
-; SSE-NEXT: movq %rbp, %r10
-; SSE-NEXT: bsrq %rbp, %rax
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %r9d, %edx
+; SSE-NEXT: movq 40(%rdi), %r9
+; SSE-NEXT: movq 48(%rdi), %rax
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: orq %r10, %r8
+; SSE-NEXT: movq 56(%rdi), %r8
+; SSE-NEXT: movdqa 112(%rdi), %xmm0
+; SSE-NEXT: cmovnel %r11d, %edx
+; SSE-NEXT: movdqa 96(%rdi), %xmm1
+; SSE-NEXT: addl $256, %edx # imm = 0x100
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %edx
+; SSE-NEXT: bsrq %r8, %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: bsrq %rax, %r11
+; SSE-NEXT: xorl $63, %r11d
+; SSE-NEXT: orl $64, %r11d
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %esi, %r11d
+; SSE-NEXT: bsrq %r9, %r10
+; SSE-NEXT: xorl $63, %r10d
+; SSE-NEXT: bsrq 32(%rdi), %rsi
+; SSE-NEXT: xorl $63, %esi
+; SSE-NEXT: orl $64, %esi
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: movq 16(%rdi), %r9
+; SSE-NEXT: cmovnel %r10d, %esi
+; SSE-NEXT: movq 24(%rdi), %r10
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: bsrq %r10, %rax
; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq 32(%rdi), %r8
-; SSE-NEXT: bsrq %r8, %rbp
-; SSE-NEXT: xorl $63, %ebp
-; SSE-NEXT: orl $64, %ebp
+; SSE-NEXT: bsrq %r9, %r8
+; SSE-NEXT: xorl $63, %r8d
+; SSE-NEXT: orl $64, %r8d
; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %r12, %rax
-; SSE-NEXT: orq %r13, %rax
-; SSE-NEXT: cmovnel %edx, %ebp
-; SSE-NEXT: movq 24(%rdi), %r9
-; SSE-NEXT: bsrq %r9, %rax
-; SSE-NEXT: xorl $63, %eax
-; SSE-NEXT: movq 16(%rdi), %rsi
-; SSE-NEXT: bsrq %rsi, %rcx
-; SSE-NEXT: xorl $63, %ecx
-; SSE-NEXT: orl $64, %ecx
-; SSE-NEXT: testq %r9, %r9
-; SSE-NEXT: cmovnel %eax, %ecx
-; SSE-NEXT: movq 8(%rdi), %rdx
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: bsrq %rcx, %r11
+; SSE-NEXT: xorl $63, %r11d
; SSE-NEXT: bsrq (%rdi), %rax
-; SSE-NEXT: bsrq %rdx, %rdi
-; SSE-NEXT: xorl $63, %edi
; SSE-NEXT: xorl $63, %eax
; SSE-NEXT: orl $64, %eax
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %edi, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r11d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %rsi
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq %r13, %r10
-; SSE-NEXT: orq %r12, %r8
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: movdqa 32(%rdi), %xmm2
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r10, %r8
-; SSE-NEXT: cmovnel %ebp, %eax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: orq %r14, %r11
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %rbx
+; SSE-NEXT: por 48(%rdi), %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: por 80(%rdi), %xmm0
+; SSE-NEXT: por 64(%rdi), %xmm1
; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r11, %rbx
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: load_ctlz_undef_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 48(%rdi), %r9
-; AVX2-NEXT: movq 56(%rdi), %rbp
-; AVX2-NEXT: movq 64(%rdi), %r11
-; AVX2-NEXT: movq 72(%rdi), %r10
-; AVX2-NEXT: movq 80(%rdi), %r14
-; AVX2-NEXT: movq 88(%rdi), %rbx
-; AVX2-NEXT: movq 96(%rdi), %rdx
-; AVX2-NEXT: movq 104(%rdi), %r8
-; AVX2-NEXT: movq 112(%rdi), %rsi
-; AVX2-NEXT: movq 120(%rdi), %r15
-; AVX2-NEXT: lzcntq %r15, %rax
-; AVX2-NEXT: lzcntq %rsi, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: testq %r15, %r15
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %r8, %r12
+; AVX2-NEXT: movq 16(%rdi), %rcx
+; AVX2-NEXT: movq 72(%rdi), %rsi
+; AVX2-NEXT: movq 104(%rdi), %rdx
+; AVX2-NEXT: movq 112(%rdi), %r8
+; AVX2-NEXT: movq 120(%rdi), %r10
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r8, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: lzcntq %rdx, %r11
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: lzcntq 96(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: movq 80(%rdi), %r9
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: movq 88(%rdi), %r11
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %rsi, %r12
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: orq %r15, %r12
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rbx, %rcx
-; AVX2-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: lzcntq %r14, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %ecx, %r13d
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r10, %rcx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: lzcntq %r11, %r12
-; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %ecx, %r12d
-; AVX2-NEXT: subl $-128, %r12d
-; AVX2-NEXT: movq %r14, %rcx
-; AVX2-NEXT: orq %rbx, %rcx
-; AVX2-NEXT: cmovnel %r13d, %r12d
-; AVX2-NEXT: addl $256, %r12d # imm = 0x100
-; AVX2-NEXT: movq %r8, %rcx
-; AVX2-NEXT: orq %r15, %rcx
-; AVX2-NEXT: orq %rsi, %rdx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r12d
-; AVX2-NEXT: movq %rbp, %r14
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %rbp, %rcx
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: orq %r10, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq %r11, %rdx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: lzcntq %r9, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %edx, %ebx
+; AVX2-NEXT: xorl %edx, %edx
+; AVX2-NEXT: lzcntq 64(%rdi), %rdx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rsi, %r8
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: movq 40(%rdi), %r10
+; AVX2-NEXT: cmovnel %r8d, %edx
+; AVX2-NEXT: movq 48(%rdi), %r8
+; AVX2-NEXT: subl $-128, %edx
+; AVX2-NEXT: orq %r11, %r9
+; AVX2-NEXT: movq 56(%rdi), %r9
+; AVX2-NEXT: cmovnel %ebx, %edx
+; AVX2-NEXT: vmovdqu 96(%rdi), %ymm0
+; AVX2-NEXT: addl $256, %edx # imm = 0x100
+; AVX2-NEXT: vpor 112(%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %eax, %edx
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq %r9, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %rbp, %rbp
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: movq 32(%rdi), %r13
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: lzcntq %r13, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: movq 40(%rdi), %r8
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: lzcntq %r8, %rdx
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %edx, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r9, %rdx
-; AVX2-NEXT: orq %r14, %rdx
-; AVX2-NEXT: cmovnel %eax, %ebp
-; AVX2-NEXT: movq 16(%rdi), %r9
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: lzcntq %r9, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq 24(%rdi), %rdx
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: lzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: lzcntq 32(%rdi), %rsi
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: lzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq 8(%rdi), %rsi
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: movq 24(%rdi), %r10
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: lzcntq %r10, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: lzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: lzcntq %r9, %r11
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: lzcntq (%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: lzcntq %rsi, %rdi
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r11d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rdx, %r9
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq %r14, %r8
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT: orq %r10, %rcx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 32(%rdi), %xmm1
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %r13
-; AVX2-NEXT: cmovnel %ebp, %eax
-; AVX2-NEXT: orq %r15, %rbx
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbx, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rcx, %r11
+; AVX2-NEXT: vpor 48(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vpor 64(%rdi), %ymm0, %ymm0
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r10, %r11
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %edx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_ctlz_undef_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq 80(%rdi), %rsi
-; AVX512F-NEXT: movq 64(%rdi), %rcx
-; AVX512F-NEXT: movq 72(%rdi), %rdx
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512F-NEXT: movq 88(%rdi), %r8
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm2, %ecx
+; AVX512F-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512F-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512F-NEXT: vmovd %xmm1, %r9d
-; AVX512F-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm0, %eax
-; AVX512F-NEXT: orq 120(%rdi), %r8
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: addl $512, %eax # imm = 0x200
-; AVX512F-NEXT: orq 104(%rdi), %rdx
-; AVX512F-NEXT: orq %r8, %rdx
-; AVX512F-NEXT: orq 112(%rdi), %rsi
-; AVX512F-NEXT: orq 96(%rdi), %rcx
-; AVX512F-NEXT: orq %rsi, %rcx
-; AVX512F-NEXT: orq %rdx, %rcx
-; AVX512F-NEXT: cmovnel %r9d, %eax
+; AVX512F-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
+; AVX512F-NEXT: cmovnel %ecx, %eax
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: load_ctlz_undef_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: movq 80(%rdi), %rsi
-; AVX512POPCNT-NEXT: movq 64(%rdi), %rcx
-; AVX512POPCNT-NEXT: movq 72(%rdi), %rdx
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512POPCNT-NEXT: movq 88(%rdi), %r8
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm2, %ecx
+; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512POPCNT-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512POPCNT-NEXT: vmovd %xmm1, %r9d
-; AVX512POPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT: orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT: orq 104(%rdi), %rdx
-; AVX512POPCNT-NEXT: orq %r8, %rdx
-; AVX512POPCNT-NEXT: orq 112(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 96(%rdi), %rcx
-; AVX512POPCNT-NEXT: orq %rsi, %rcx
-; AVX512POPCNT-NEXT: orq %rdx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT: cmovnel %ecx, %eax
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: load_ctlz_undef_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq 80(%rdi), %rsi
-; AVX512VL-NEXT: movq 64(%rdi), %rcx
-; AVX512VL-NEXT: movq 72(%rdi), %rdx
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512VL-NEXT: movq 88(%rdi), %r8
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm2, %ecx
+; AVX512VL-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512VL-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm1, %r9d
-; AVX512VL-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512VL-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vmovd %xmm1, %eax
; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: orq 120(%rdi), %r8
-; AVX512VL-NEXT: orq 104(%rdi), %rdx
-; AVX512VL-NEXT: orq 112(%rdi), %rsi
-; AVX512VL-NEXT: orq %r8, %rdx
-; AVX512VL-NEXT: orq 96(%rdi), %rcx
-; AVX512VL-NEXT: orq %rsi, %rcx
-; AVX512VL-NEXT: orq %rdx, %rcx
-; AVX512VL-NEXT: cmovnel %r9d, %eax
+; AVX512VL-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
+; AVX512VL-NEXT: cmovnel %ecx, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: load_ctlz_undef_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: movq 80(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: movq 64(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: movq 72(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT: vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: movq 88(%rdi), %r8
+; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT: vpermq %zmm0, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vplzcntq %zmm2, %zmm3
+; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm2, %zmm2, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm2, %ecx
+; AVX512VLPOPCNT-NEXT: vpermq (%rdi), %zmm1, %zmm1
; AVX512VLPOPCNT-NEXT: vplzcntq %zmm1, %zmm2
-; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %r9d
-; AVX512VLPOPCNT-NEXT: vpermq (%rdi), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT: vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
; AVX512VLPOPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT: orq 120(%rdi), %r8
-; AVX512VLPOPCNT-NEXT: orq 104(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: orq 112(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq %r8, %rdx
-; AVX512VLPOPCNT-NEXT: orq 96(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: orq %rsi, %rcx
-; AVX512VLPOPCNT-NEXT: orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT: cmovnel %r9d, %eax
+; AVX512VLPOPCNT-NEXT: vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: cmovnel %ecx, %eax
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
@@ -5452,109 +5060,92 @@ define i32 @test_cttz_i512(i512 %a0) nounwind {
define i32 @load_cttz_i512(ptr %p0) nounwind {
; SSE-LABEL: load_cttz_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 48(%rdi), %r10
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: movq 24(%rdi), %r8
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %rsi
-; SSE-NEXT: rep bsfq %rcx, %rax
-; SSE-NEXT: rep bsfq %rsi, %rbx
-; SSE-NEXT: addl $64, %ebx
-; SSE-NEXT: testq %rcx, %rcx
-; SSE-NEXT: cmovnel %eax, %ebx
-; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: rep bsfq %r8, %r11
-; SSE-NEXT: addl $64, %r11d
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %r11d
-; SSE-NEXT: movq 32(%rdi), %r14
-; SSE-NEXT: subl $-128, %r11d
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: cmovnel %ebx, %r11d
-; SSE-NEXT: rep bsfq %r14, %rax
-; SSE-NEXT: rep bsfq %r9, %rbx
-; SSE-NEXT: addl $64, %ebx
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %eax, %ebx
-; SSE-NEXT: rep bsfq %r10, %r15
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movq 48(%rdi), %rdx
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq (%rdi), %rax
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: rep bsfq %rax, %rsi
+; SSE-NEXT: rep bsfq %r8, %r9
+; SSE-NEXT: addl $64, %r9d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %esi, %r9d
+; SSE-NEXT: movq 16(%rdi), %r10
+; SSE-NEXT: rep bsfq %r10, %r11
+; SSE-NEXT: rep bsfq 24(%rdi), %rsi
+; SSE-NEXT: addl $64, %esi
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: cmovnel %r9d, %esi
+; SSE-NEXT: movq 32(%rdi), %r8
+; SSE-NEXT: rep bsfq %r8, %rax
+; SSE-NEXT: rep bsfq %rcx, %r9
+; SSE-NEXT: addl $64, %r9d
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %eax, %r9d
+; SSE-NEXT: rep bsfq %rdx, %r10
; SSE-NEXT: movl $64, %eax
; SSE-NEXT: rep bsfq 56(%rdi), %rax
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r10d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %r14
-; SSE-NEXT: cmovnel %ebx, %eax
+; SSE-NEXT: orq %rcx, %r8
+; SSE-NEXT: cmovnel %r9d, %eax
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rdx, %rcx
-; SSE-NEXT: orq %rsi, %rcx
-; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: por 16(%rdi), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %esi, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
-; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: load_cttz_i512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 48(%rdi), %r10
-; AVX2-NEXT: movq 40(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r8
-; AVX2-NEXT: movq 16(%rdi), %rdx
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: movq 8(%rdi), %rsi
-; AVX2-NEXT: tzcntq %rcx, %rax
-; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: tzcntq %rsi, %rbx
-; AVX2-NEXT: addl $64, %ebx
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: movq 40(%rdi), %rcx
+; AVX2-NEXT: movq 32(%rdi), %rdx
+; AVX2-NEXT: movq 16(%rdi), %rax
+; AVX2-NEXT: movq (%rdi), %r8
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: tzcntq %r8, %rsi
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %esi, %r10d
+; AVX2-NEXT: tzcntq %rax, %r11
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: tzcntq 24(%rdi), %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: tzcntq %r8, %r11
-; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r11d
-; AVX2-NEXT: subl $-128, %r11d
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: orq %rsi, %rax
-; AVX2-NEXT: cmovnel %ebx, %r11d
-; AVX2-NEXT: movq 32(%rdi), %rbx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rbx, %rax
-; AVX2-NEXT: xorl %r14d, %r14d
-; AVX2-NEXT: tzcntq %r9, %r14
-; AVX2-NEXT: addl $64, %r14d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %eax, %r14d
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r10, %r15
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 48(%rdi), %r9
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r9, %r10
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 56(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r9, %rbx
-; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cttz_i512:
@@ -5786,97 +5377,91 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
; SSE-NEXT: pushq %r13
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %r9, %r13
-; SSE-NEXT: movq %r8, %r14
-; SSE-NEXT: movq %rcx, %rbx
-; SSE-NEXT: movq %rdx, %r10
-; SSE-NEXT: movq %rsi, %r9
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: rep bsfq %rdi, %r11
+; SSE-NEXT: rep bsfq %rsi, %rbx
+; SSE-NEXT: addl $64, %ebx
+; SSE-NEXT: testq %rdi, %rdi
+; SSE-NEXT: cmovnel %r11d, %ebx
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: rep bsfq %rdi, %rax
-; SSE-NEXT: rep bsfq %r9, %r15
+; SSE-NEXT: rep bsfq %rdx, %r12
+; SSE-NEXT: rep bsfq %rcx, %r15
; SSE-NEXT: addl $64, %r15d
-; SSE-NEXT: testq %rdi, %rdi
-; SSE-NEXT: cmovnel %eax, %r15d
-; SSE-NEXT: rep bsfq %r10, %r12
-; SSE-NEXT: rep bsfq %rcx, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %r12d, %eax
-; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r12d, %r15d
+; SSE-NEXT: subl $-128, %r15d
; SSE-NEXT: movq %rdi, %r12
-; SSE-NEXT: orq %r9, %r12
-; SSE-NEXT: cmovnel %r15d, %eax
-; SSE-NEXT: rep bsfq %r8, %r15
-; SSE-NEXT: movq %r13, %rcx
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: rep bsfq %r13, %r13
+; SSE-NEXT: orq %rsi, %r12
+; SSE-NEXT: cmovnel %ebx, %r15d
+; SSE-NEXT: rep bsfq %r8, %rbx
+; SSE-NEXT: rep bsfq %r10, %r13
; SSE-NEXT: addl $64, %r13d
; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %ebx, %r13d
+; SSE-NEXT: rep bsfq %r9, %r12
+; SSE-NEXT: rep bsfq %r11, %rbx
+; SSE-NEXT: addl $64, %ebx
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r12d, %ebx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: subl $-128, %ebx
+; SSE-NEXT: movq %r8, %rbp
+; SSE-NEXT: orq %r10, %rbp
+; SSE-NEXT: cmovnel %r13d, %ebx
+; SSE-NEXT: addl $256, %ebx # imm = 0x100
+; SSE-NEXT: movq %rsi, %r13
+; SSE-NEXT: orq %rcx, %r13
+; SSE-NEXT: movq %rdi, %rbp
+; SSE-NEXT: orq %rdx, %rbp
+; SSE-NEXT: orq %r13, %rbp
+; SSE-NEXT: cmovnel %r15d, %ebx
+; SSE-NEXT: rep bsfq %rax, %r15
+; SSE-NEXT: rep bsfq %r12, %r13
+; SSE-NEXT: addl $64, %r13d
+; SSE-NEXT: testq %rax, %rax
; SSE-NEXT: cmovnel %r15d, %r13d
-; SSE-NEXT: rep bsfq %rdx, %r12
+; SSE-NEXT: rep bsfq %r14, %rbp
; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r15
; SSE-NEXT: addl $64, %r15d
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %r12d, %r15d
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: testq %r14, %r14
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT: cmovnel %ebp, %r15d
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
; SSE-NEXT: subl $-128, %r15d
-; SSE-NEXT: movq %r8, %rbp
-; SSE-NEXT: orq %rcx, %rbp
+; SSE-NEXT: orq %r12, %rax
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12
; SSE-NEXT: cmovnel %r13d, %r15d
-; SSE-NEXT: addl $256, %r15d # imm = 0x100
-; SSE-NEXT: movq %r9, %r13
-; SSE-NEXT: orq %rbx, %r13
-; SSE-NEXT: movq %rdi, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r13, %rbp
-; SSE-NEXT: cmovnel %eax, %r15d
-; SSE-NEXT: rep bsfq %r11, %r13
-; SSE-NEXT: rep bsfq %r12, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %r13d, %eax
-; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT: rep bsfq %r14, %rax
+; SSE-NEXT: rep bsfq %r12, %r13
; SSE-NEXT: addl $64, %r13d
-; SSE-NEXT: rep bsfq %rsi, %rcx
-; SSE-NEXT: testq %rsi, %rsi
-; SSE-NEXT: cmovnel %ecx, %r13d
-; SSE-NEXT: subl $-128, %r13d
-; SSE-NEXT: movq %r11, %rcx
-; SSE-NEXT: orq %r12, %rcx
+; SSE-NEXT: testq %r14, %r14
; SSE-NEXT: cmovnel %eax, %r13d
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE-NEXT: rep bsfq %rbp, %rcx
-; SSE-NEXT: addl $64, %ecx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %ecx
; SSE-NEXT: movl $64, %eax
; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: rep bsfq %r8, %rsi
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: rep bsfq %r9, %rbp
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %ebp, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %rbp, %rdx
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r12
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r12, %r11
+; SSE-NEXT: orq %r12, %r14
; SSE-NEXT: cmovnel %r13d, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %r9
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: orq %r14, %rdi
-; SSE-NEXT: orq %r10, %rdi
-; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r9, %rdi
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: addl $256, %eax # imm = 0x100
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: orq %r11, %rcx
+; SSE-NEXT: orq %r10, %rsi
+; SSE-NEXT: orq %rcx, %rsi
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT: orq %r8, %rdi
+; SSE-NEXT: orq %rdx, %rdi
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %rsi, %rdi
+; SSE-NEXT: cmovnel %ebx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
@@ -5894,111 +5479,108 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %r9, %rbx
-; AVX2-NEXT: movq %r8, %r14
-; AVX2-NEXT: movq %rcx, %r11
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movq %rsi, %r9
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movq %r9, %r10
+; AVX2-NEXT: movq %r8, %r9
+; AVX2-NEXT: movq %rcx, %r8
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: tzcntq %rdi, %rax
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r9, %r15
-; AVX2-NEXT: addl $64, %r15d
-; AVX2-NEXT: testq %rdi, %rdi
-; AVX2-NEXT: cmovnel %eax, %r15d
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %r10, %r12
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r11, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r12d, %eax
-; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %rdi, %r12
-; AVX2-NEXT: orq %r9, %r12
-; AVX2-NEXT: cmovnel %r15d, %eax
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r14, %r15
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rdi, %rbx
; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: tzcntq %rsi, %r12
; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovnel %r15d, %r12d
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %rcx, %r13
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rdx, %rbx
; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %rdx, %r15
+; AVX2-NEXT: tzcntq %r8, %r15
; AVX2-NEXT: addl $64, %r15d
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %r13d, %r15d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %ebx, %r15d
; AVX2-NEXT: subl $-128, %r15d
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: orq %rbx, %r13
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: orq %rsi, %rbx
; AVX2-NEXT: cmovnel %r12d, %r15d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: addl $256, %r15d # imm = 0x100
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r10, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r11, %r13
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rcx, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %r13d, %ebx
+; AVX2-NEXT: subl $-128, %ebx
; AVX2-NEXT: movq %r9, %r13
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: movq %rdi, %rbp
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r13, %rbp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: cmovnel %eax, %r15d
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: tzcntq %r12, %rbp
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r13, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: cmovnel %r12d, %ebx
+; AVX2-NEXT: addl $256, %ebx # imm = 0x100
+; AVX2-NEXT: movq %rsi, %r12
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: movq %rdi, %r13
+; AVX2-NEXT: orq %rdx, %r13
+; AVX2-NEXT: orq %r12, %r13
+; AVX2-NEXT: cmovnel %r15d, %ebx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %rax, %r15
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r12, %r13
+; AVX2-NEXT: addl $64, %r13d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r15d, %r13d
; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: tzcntq %r8, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %rsi, %rcx
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %ecx, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r12, %rcx
-; AVX2-NEXT: orq %r13, %rcx
-; AVX2-NEXT: cmovnel %eax, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %rbx, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: tzcntq %r14, %rbp
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %r14, %r14
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: cmovnel %ebp, %r15d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT: subl $-128, %r15d
+; AVX2-NEXT: orq %r12, %rax
+; AVX2-NEXT: cmovnel %r13d, %r15d
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: tzcntq %rbp, %rax
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r14, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %rbp, %rbp
+; AVX2-NEXT: cmovnel %eax, %r12d
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: tzcntq %r8, %rsi
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %esi, %eax
-; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rbx, %rdx
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %r13, %rcx
+; AVX2-NEXT: testq %r13, %r13
; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r14, %rbp
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r13, %r12
-; AVX2-NEXT: cmovnel %ebp, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: orq %r14, %rdi
-; AVX2-NEXT: orq %r10, %rdi
-; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r9, %rdi
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: orq %r10, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %r11, %rdx
+; AVX2-NEXT: orq %r9, %rdi
+; AVX2-NEXT: orq %rdx, %rdi
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %rsi, %rdi
+; AVX2-NEXT: cmovnel %ebx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -6189,389 +5771,301 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
define i32 @load_cttz_i1024(ptr %p0) nounwind {
; SSE-LABEL: load_cttz_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 88(%rdi), %r10
-; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 56(%rdi), %rcx
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 40(%rdi), %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 24(%rdi), %r9
-; SSE-NEXT: movq 16(%rdi), %r15
-; SSE-NEXT: movq (%rdi), %r8
-; SSE-NEXT: movq 8(%rdi), %r11
-; SSE-NEXT: rep bsfq %r8, %rax
+; SSE-NEXT: movq 112(%rdi), %rcx
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq (%rdi), %rdx
+; SSE-NEXT: movq 8(%rdi), %r8
+; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: rep bsfq %r8, %r9
+; SSE-NEXT: addl $64, %r9d
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %eax, %r9d
+; SSE-NEXT: movq 16(%rdi), %r10
+; SSE-NEXT: rep bsfq %r10, %r11
+; SSE-NEXT: rep bsfq 24(%rdi), %rax
+; SSE-NEXT: addl $64, %eax
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: movq 40(%rdi), %r10
+; SSE-NEXT: movq 32(%rdi), %r11
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %r8, %rdx
+; SSE-NEXT: cmovnel %r9d, %eax
; SSE-NEXT: rep bsfq %r11, %rdx
+; SSE-NEXT: rep bsfq %r10, %rbx
+; SSE-NEXT: addl $64, %ebx
+; SSE-NEXT: testq %r11, %r11
+; SSE-NEXT: cmovnel %edx, %ebx
+; SSE-NEXT: rep bsfq %rsi, %r9
+; SSE-NEXT: rep bsfq 56(%rdi), %rdx
; SSE-NEXT: addl $64, %edx
-; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: movq 72(%rdi), %r8
+; SSE-NEXT: cmovnel %r9d, %edx
+; SSE-NEXT: movq 64(%rdi), %r9
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: orq %r10, %r11
+; SSE-NEXT: cmovnel %ebx, %edx
+; SSE-NEXT: addl $256, %edx # imm = 0x100
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
; SSE-NEXT: cmovnel %eax, %edx
-; SSE-NEXT: rep bsfq %r15, %rbx
; SSE-NEXT: rep bsfq %r9, %rax
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r15, %r15
-; SSE-NEXT: cmovnel %ebx, %eax
-; SSE-NEXT: movq 32(%rdi), %rbx
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %r8, %r14
-; SSE-NEXT: orq %r11, %r14
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: rep bsfq %rbx, %rdx
-; SSE-NEXT: rep bsfq %rsi, %r12
-; SSE-NEXT: addl $64, %r12d
+; SSE-NEXT: rep bsfq %r8, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: movq 80(%rdi), %rax
+; SSE-NEXT: rep bsfq %rax, %r10
+; SSE-NEXT: rep bsfq 88(%rdi), %rsi
+; SSE-NEXT: addl $64, %esi
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %r10d, %esi
+; SSE-NEXT: movq 104(%rdi), %r10
+; SSE-NEXT: movq 96(%rdi), %rbx
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r8, %r9
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: rep bsfq %rbx, %rax
+; SSE-NEXT: rep bsfq %r10, %r8
+; SSE-NEXT: addl $64, %r8d
; SSE-NEXT: testq %rbx, %rbx
-; SSE-NEXT: cmovnel %edx, %r12d
-; SSE-NEXT: movq 48(%rdi), %r13
-; SSE-NEXT: movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: rep bsfq %r13, %rdx
-; SSE-NEXT: rep bsfq %rcx, %r14
-; SSE-NEXT: addl $64, %r14d
-; SSE-NEXT: testq %r13, %r13
-; SSE-NEXT: cmovnel %edx, %r14d
-; SSE-NEXT: subl $-128, %r14d
-; SSE-NEXT: movq %rbx, %rdx
-; SSE-NEXT: orq %rsi, %rdx
-; SSE-NEXT: cmovnel %r12d, %r14d
-; SSE-NEXT: movq 72(%rdi), %r12
-; SSE-NEXT: addl $256, %r14d # imm = 0x100
-; SSE-NEXT: movq %r11, %rdx
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: movq %r8, %r13
-; SSE-NEXT: orq %r15, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: movq 64(%rdi), %r13
-; SSE-NEXT: cmovnel %eax, %r14d
-; SSE-NEXT: rep bsfq %r13, %rdx
-; SSE-NEXT: rep bsfq %r12, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r13, %r13
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: rep bsfq %r10, %rbp
-; SSE-NEXT: addl $64, %ebp
-; SSE-NEXT: movq 80(%rdi), %r10
-; SSE-NEXT: rep bsfq %r10, %rcx
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %ecx, %ebp
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %r13, %rcx
-; SSE-NEXT: orq %r12, %rcx
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: movq 104(%rdi), %r9
-; SSE-NEXT: rep bsfq %r9, %rcx
-; SSE-NEXT: addl $64, %ecx
-; SSE-NEXT: movq 96(%rdi), %rdx
-; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: rep bsfq %rcx, %r9
; SSE-NEXT: movl $64, %eax
; SSE-NEXT: rep bsfq 120(%rdi), %rax
-; SSE-NEXT: movq 112(%rdi), %rdi
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: rep bsfq %rdi, %rsi
-; SSE-NEXT: testq %rdi, %rdi
-; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: testq %rcx, %rcx
+; SSE-NEXT: cmovnel %r9d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; SSE-NEXT: orq %r10, %r13
+; SSE-NEXT: orq %r10, %rbx
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: movdqa 64(%rdi), %xmm2
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r12, %r13
-; SSE-NEXT: cmovnel %ebp, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %r11
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %r8
-; SSE-NEXT: orq %r15, %r8
+; SSE-NEXT: por 80(%rdi), %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: por 48(%rdi), %xmm0
+; SSE-NEXT: por 32(%rdi), %xmm1
; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: cmovnel %r14d, %eax
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: load_cttz_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 72(%rdi), %r14
-; AVX2-NEXT: movq 64(%rdi), %r15
-; AVX2-NEXT: movq 56(%rdi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %r10
-; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 32(%rdi), %rsi
-; AVX2-NEXT: movq 24(%rdi), %rbp
-; AVX2-NEXT: movq 16(%rdi), %rbx
+; AVX2-NEXT: movq 104(%rdi), %rcx
+; AVX2-NEXT: movq 48(%rdi), %rsi
+; AVX2-NEXT: movq 16(%rdi), %rdx
; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r11
+; AVX2-NEXT: movq 8(%rdi), %r9
; AVX2-NEXT: tzcntq %r8, %rax
-; AVX2-NEXT: tzcntq %r11, %rdx
-; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: addl $64, %ebx
; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %eax, %edx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: tzcntq %rdx, %r11
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rbp, %rax
-; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: tzcntq 24(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: movq 40(%rdi), %r10
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: movq 32(%rdi), %r11
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %r8, %r12
-; AVX2-NEXT: orq %r11, %r12
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %rsi, %rdx
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %r10, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: cmovnel %edx, %r13d
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %rcx, %rdx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %r9, %r12
-; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %edx, %r12d
-; AVX2-NEXT: subl $-128, %r12d
-; AVX2-NEXT: movq %rsi, %rdx
-; AVX2-NEXT: orq %r10, %rdx
-; AVX2-NEXT: cmovnel %r13d, %r12d
-; AVX2-NEXT: addl $256, %r12d # imm = 0x100
-; AVX2-NEXT: movq %r11, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: movq %r8, %r13
-; AVX2-NEXT: orq %rbx, %r13
-; AVX2-NEXT: orq %rdx, %r13
-; AVX2-NEXT: cmovnel %eax, %r12d
+; AVX2-NEXT: tzcntq %r11, %rdx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r10, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %edx, %ebx
; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %r15, %rdx
+; AVX2-NEXT: tzcntq 56(%rdi), %rdx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rsi, %r8
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: movq 80(%rdi), %r9
+; AVX2-NEXT: cmovnel %r8d, %edx
+; AVX2-NEXT: movq 72(%rdi), %r8
+; AVX2-NEXT: subl $-128, %edx
+; AVX2-NEXT: orq %r10, %r11
+; AVX2-NEXT: movq 64(%rdi), %r10
+; AVX2-NEXT: cmovnel %ebx, %edx
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: addl $256, %edx # imm = 0x100
+; AVX2-NEXT: vpor 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %eax, %edx
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r14, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r15, %r15
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: movq 88(%rdi), %rbp
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %rbp, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: movq 80(%rdi), %r10
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %r10, %rcx
+; AVX2-NEXT: tzcntq %r10, %rax
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: tzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %ecx, %r13d
-; AVX2-NEXT: subl $-128, %r13d
-; AVX2-NEXT: movq %r15, %rcx
-; AVX2-NEXT: orq %r14, %rcx
-; AVX2-NEXT: cmovnel %eax, %r13d
-; AVX2-NEXT: movq 104(%rdi), %r9
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %r9, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq 96(%rdi), %rdx
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: tzcntq 88(%rdi), %rsi
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq 112(%rdi), %rsi
+; AVX2-NEXT: tzcntq %r9, %rax
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: movq 96(%rdi), %r9
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r8, %r10
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r9, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 112(%rdi), %r10
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: tzcntq %r10, %r11
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 120(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: tzcntq %rsi, %rdi
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %r11d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq %rbp, %r14
-; AVX2-NEXT: orq %r10, %r15
+; AVX2-NEXT: orq %rcx, %r9
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r14, %r15
-; AVX2-NEXT: cmovnel %r13d, %eax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rcx, %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbx, %r8
+; AVX2-NEXT: vpor 80(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vpor 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %edx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cttz_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512F-NEXT: movq 16(%rdi), %rax
-; AVX512F-NEXT: movq (%rdi), %rcx
-; AVX512F-NEXT: movq 8(%rdi), %rdx
-; AVX512F-NEXT: movq 24(%rdi), %rsi
-; AVX512F-NEXT: orq 56(%rdi), %rsi
-; AVX512F-NEXT: orq 40(%rdi), %rdx
-; AVX512F-NEXT: orq 48(%rdi), %rax
-; AVX512F-NEXT: orq 32(%rdi), %rcx
-; AVX512F-NEXT: orq %rsi, %rdx
-; AVX512F-NEXT: orq %rax, %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm1, %esi
-; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
-; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm3, %ecx
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: addl $512, %eax # imm = 0x200
-; AVX512F-NEXT: orq %rdx, %rcx
-; AVX512F-NEXT: cmovnel %esi, %eax
+; AVX512F-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
+; AVX512F-NEXT: cmovnel %ecx, %eax
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: load_cttz_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512POPCNT-NEXT: movq 16(%rdi), %rax
-; AVX512POPCNT-NEXT: movq (%rdi), %rcx
-; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx
-; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx
-; AVX512POPCNT-NEXT: orq 48(%rdi), %rax
-; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx
-; AVX512POPCNT-NEXT: orq %rsi, %rdx
-; AVX512POPCNT-NEXT: orq %rax, %rcx
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
-; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm1, %esi
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm3, %ecx
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpopcntq %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT: orq %rdx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %esi, %eax
+; AVX512POPCNT-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT: cmovnel %ecx, %eax
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: load_cttz_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512VL-NEXT: movq 16(%rdi), %rax
-; AVX512VL-NEXT: movq (%rdi), %rcx
-; AVX512VL-NEXT: movq 8(%rdi), %rdx
-; AVX512VL-NEXT: movq 24(%rdi), %rsi
-; AVX512VL-NEXT: orq 56(%rdi), %rsi
-; AVX512VL-NEXT: orq 40(%rdi), %rdx
-; AVX512VL-NEXT: orq %rsi, %rdx
-; AVX512VL-NEXT: orq 48(%rdi), %rax
-; AVX512VL-NEXT: orq 32(%rdi), %rcx
-; AVX512VL-NEXT: orq %rax, %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm1, %esi
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
-; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1
; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm3, %ecx
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: orq %rdx, %rcx
-; AVX512VL-NEXT: cmovnel %esi, %eax
+; AVX512VL-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
+; AVX512VL-NEXT: cmovnel %ecx, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: load_cttz_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512VLPOPCNT-NEXT: movq 16(%rdi), %rax
-; AVX512VLPOPCNT-NEXT: movq (%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: movq 8(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: movq 24(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq 56(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq 40(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: orq %rsi, %rdx
-; AVX512VLPOPCNT-NEXT: orq 48(%rdi), %rax
-; AVX512VLPOPCNT-NEXT: orq 32(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: orq %rax, %rcx
+; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512VLPOPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VLPOPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VLPOPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512VLPOPCNT-NEXT: vpopcntq %zmm3, %zmm3
; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %esi
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpopcntq %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm3, %ecx
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpopcntq %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
; AVX512VLPOPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT: orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT: cmovnel %esi, %eax
+; AVX512VLPOPCNT-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: cmovnel %ecx, %eax
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
@@ -7177,108 +6671,93 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind {
define i32 @load_cttz_undef_i512(ptr %p0) nounwind {
; SSE-LABEL: load_cttz_undef_i512:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 40(%rdi), %r9
-; SSE-NEXT: movq 24(%rdi), %r8
-; SSE-NEXT: movq 16(%rdi), %rdx
-; SSE-NEXT: movq (%rdi), %rcx
-; SSE-NEXT: movq 8(%rdi), %rsi
-; SSE-NEXT: rep bsfq %rcx, %rax
-; SSE-NEXT: rep bsfq %rsi, %r11
+; SSE-NEXT: movdqa (%rdi), %xmm0
+; SSE-NEXT: movq 48(%rdi), %rsi
+; SSE-NEXT: movq 40(%rdi), %rcx
+; SSE-NEXT: movq 32(%rdi), %rdx
+; SSE-NEXT: movq 16(%rdi), %rax
+; SSE-NEXT: movq (%rdi), %r9
+; SSE-NEXT: movq 8(%rdi), %r10
+; SSE-NEXT: rep bsfq %r9, %r8
+; SSE-NEXT: rep bsfq %r10, %r11
; SSE-NEXT: addl $64, %r11d
-; SSE-NEXT: testq %rcx, %rcx
-; SSE-NEXT: cmovnel %eax, %r11d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %r8d, %r11d
+; SSE-NEXT: rep bsfq %rax, %rbx
+; SSE-NEXT: rep bsfq 24(%rdi), %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %ebx, %r8d
+; SSE-NEXT: subl $-128, %r8d
+; SSE-NEXT: orq %r10, %r9
+; SSE-NEXT: cmovnel %r11d, %r8d
; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: rep bsfq %r8, %r10
-; SSE-NEXT: addl $64, %r10d
+; SSE-NEXT: rep bsfq %rcx, %r9
+; SSE-NEXT: addl $64, %r9d
; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %r10d
-; SSE-NEXT: movq 32(%rdi), %rbx
-; SSE-NEXT: subl $-128, %r10d
-; SSE-NEXT: movq %rcx, %rax
-; SSE-NEXT: orq %rsi, %rax
-; SSE-NEXT: cmovnel %r11d, %r10d
-; SSE-NEXT: rep bsfq %rbx, %rax
-; SSE-NEXT: rep bsfq %r9, %r11
-; SSE-NEXT: addl $64, %r11d
-; SSE-NEXT: testq %rbx, %rbx
-; SSE-NEXT: cmovnel %eax, %r11d
-; SSE-NEXT: movq 48(%rdi), %r14
-; SSE-NEXT: rep bsfq %r14, %r15
+; SSE-NEXT: cmovnel %eax, %r9d
+; SSE-NEXT: rep bsfq %rsi, %r10
; SSE-NEXT: rep bsfq 56(%rdi), %rax
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: testq %rsi, %rsi
+; SSE-NEXT: cmovnel %r10d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %rbx
-; SSE-NEXT: cmovnel %r11d, %eax
+; SSE-NEXT: orq %rcx, %rdx
+; SSE-NEXT: cmovnel %r9d, %eax
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r8, %rsi
-; SSE-NEXT: orq %rdx, %rcx
-; SSE-NEXT: orq %rsi, %rcx
-; SSE-NEXT: cmovnel %r10d, %eax
+; SSE-NEXT: por 16(%rdi), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %r8d, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
; SSE-NEXT: retq
;
; AVX2-LABEL: load_cttz_undef_i512:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 48(%rdi), %r10
-; AVX2-NEXT: movq 40(%rdi), %r9
-; AVX2-NEXT: movq 24(%rdi), %r8
-; AVX2-NEXT: movq 16(%rdi), %rdx
-; AVX2-NEXT: movq (%rdi), %rcx
-; AVX2-NEXT: movq 8(%rdi), %rsi
-; AVX2-NEXT: tzcntq %rcx, %rax
-; AVX2-NEXT: xorl %ebx, %ebx
-; AVX2-NEXT: tzcntq %rsi, %rbx
-; AVX2-NEXT: addl $64, %ebx
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: movq 40(%rdi), %rcx
+; AVX2-NEXT: movq 32(%rdi), %rdx
+; AVX2-NEXT: movq 16(%rdi), %rax
+; AVX2-NEXT: movq (%rdi), %r8
+; AVX2-NEXT: movq 8(%rdi), %r9
+; AVX2-NEXT: tzcntq %r8, %rsi
+; AVX2-NEXT: tzcntq %r9, %r10
+; AVX2-NEXT: addl $64, %r10d
+; AVX2-NEXT: testq %r8, %r8
+; AVX2-NEXT: cmovnel %esi, %r10d
+; AVX2-NEXT: tzcntq %rax, %r11
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: tzcntq 24(%rdi), %rsi
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %r10d, %esi
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: tzcntq %r8, %r11
-; AVX2-NEXT: addl $64, %r11d
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %r11d
-; AVX2-NEXT: subl $-128, %r11d
-; AVX2-NEXT: movq %rcx, %rax
-; AVX2-NEXT: orq %rsi, %rax
-; AVX2-NEXT: cmovnel %ebx, %r11d
-; AVX2-NEXT: movq 32(%rdi), %rbx
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rbx, %rax
-; AVX2-NEXT: xorl %r14d, %r14d
-; AVX2-NEXT: tzcntq %r9, %r14
-; AVX2-NEXT: addl $64, %r14d
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %eax, %r14d
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r10, %r15
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 48(%rdi), %r9
+; AVX2-NEXT: xorl %r10d, %r10d
+; AVX2-NEXT: tzcntq %r9, %r10
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 56(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %r10d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r9, %rbx
-; AVX2-NEXT: cmovnel %r14d, %eax
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa (%rdi), %xmm0
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r8, %rsi
-; AVX2-NEXT: orq %rdx, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
-; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
+; AVX2-NEXT: cmovnel %esi, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cttz_undef_i512:
@@ -7501,95 +6980,92 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
; SSE-NEXT: pushq %r13
; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq %r9, %r14
-; SSE-NEXT: movq %rcx, %rbx
-; SSE-NEXT: movq %rdx, %r10
-; SSE-NEXT: movq %rsi, %r9
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: movq %r9, %r10
+; SSE-NEXT: movq %r8, %r9
+; SSE-NEXT: movq %rcx, %r8
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT: rep bsfq %rdi, %rax
-; SSE-NEXT: rep bsfq %rsi, %r12
-; SSE-NEXT: addl $64, %r12d
+; SSE-NEXT: rep bsfq %rdi, %r11
+; SSE-NEXT: rep bsfq %rsi, %rbx
+; SSE-NEXT: addl $64, %ebx
; SSE-NEXT: testq %rdi, %rdi
-; SSE-NEXT: cmovnel %eax, %r12d
-; SSE-NEXT: rep bsfq %r10, %r15
-; SSE-NEXT: rep bsfq %rbx, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %r15d, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %rdi, %r13
-; SSE-NEXT: orq %rsi, %r13
-; SSE-NEXT: cmovnel %r12d, %eax
-; SSE-NEXT: movq %r8, %r15
-; SSE-NEXT: rep bsfq %r8, %r12
-; SSE-NEXT: movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: rep bsfq %r14, %r13
-; SSE-NEXT: addl $64, %r13d
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %r12d, %r13d
-; SSE-NEXT: rep bsfq %rcx, %rbp
-; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: cmovnel %r11d, %ebx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: rep bsfq %rdx, %r15
+; SSE-NEXT: rep bsfq %r8, %r14
+; SSE-NEXT: addl $64, %r14d
+; SSE-NEXT: testq %rdx, %rdx
+; SSE-NEXT: cmovnel %r15d, %r14d
+; SSE-NEXT: subl $-128, %r14d
+; SSE-NEXT: movq %rdi, %r15
+; SSE-NEXT: orq %rsi, %r15
+; SSE-NEXT: cmovnel %ebx, %r14d
+; SSE-NEXT: rep bsfq %r9, %rbx
+; SSE-NEXT: rep bsfq %r10, %r12
; SSE-NEXT: addl $64, %r12d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %ebx, %r12d
+; SSE-NEXT: rep bsfq %rcx, %r15
+; SSE-NEXT: rep bsfq %r11, %rbx
+; SSE-NEXT: addl $64, %ebx
; SSE-NEXT: testq %rcx, %rcx
-; SSE-NEXT: cmovnel %ebp, %r12d
-; SSE-NEXT: subl $-128, %r12d
-; SSE-NEXT: movq %r8, %rbp
-; SSE-NEXT: orq %r14, %rbp
-; SSE-NEXT: cmovnel %r13d, %r12d
-; SSE-NEXT: addl $256, %r12d # imm = 0x100
-; SSE-NEXT: movq %rsi, %r13
-; SSE-NEXT: orq %rbx, %r13
-; SSE-NEXT: movq %rdi, %rbp
-; SSE-NEXT: orq %r10, %rbp
-; SSE-NEXT: orq %r13, %rbp
+; SSE-NEXT: cmovnel %r15d, %ebx
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r15
+; SSE-NEXT: subl $-128, %ebx
+; SSE-NEXT: movq %r9, %r13
+; SSE-NEXT: orq %r10, %r13
+; SSE-NEXT: cmovnel %r12d, %ebx
+; SSE-NEXT: addl $256, %ebx # imm = 0x100
+; SSE-NEXT: movq %rsi, %r12
+; SSE-NEXT: orq %r8, %r12
+; SSE-NEXT: movq %rdi, %r13
+; SSE-NEXT: orq %rdx, %r13
+; SSE-NEXT: orq %r12, %r13
+; SSE-NEXT: cmovnel %r14d, %ebx
+; SSE-NEXT: rep bsfq %rax, %r14
+; SSE-NEXT: rep bsfq %r15, %r13
+; SSE-NEXT: addl $64, %r13d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %r14d, %r13d
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: rep bsfq %r12, %rbp
+; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT: addl $64, %r14d
+; SSE-NEXT: testq %r12, %r12
+; SSE-NEXT: cmovnel %ebp, %r14d
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; SSE-NEXT: subl $-128, %r14d
+; SSE-NEXT: orq %r15, %rax
+; SSE-NEXT: cmovnel %r13d, %r14d
+; SSE-NEXT: rep bsfq %r12, %rax
+; SSE-NEXT: rep bsfq %rbp, %r15
+; SSE-NEXT: addl $64, %r15d
+; SSE-NEXT: testq %r12, %r12
+; SSE-NEXT: cmovnel %eax, %r15d
; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: cmovnel %eax, %r12d
-; SSE-NEXT: rep bsfq %r11, %rbp
-; SSE-NEXT: rep bsfq %r13, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r11, %r11
-; SSE-NEXT: cmovnel %ebp, %eax
-; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rbp
-; SSE-NEXT: addl $64, %ebp
-; SSE-NEXT: rep bsfq %rdx, %rcx
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %ecx, %ebp
-; SSE-NEXT: subl $-128, %ebp
-; SSE-NEXT: movq %r11, %rcx
-; SSE-NEXT: orq %r13, %rcx
-; SSE-NEXT: cmovnel %eax, %ebp
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT: rep bsfq %r14, %rcx
-; SSE-NEXT: addl $64, %ecx
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT: rep bsfq %rdx, %rax
-; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %ecx
; SSE-NEXT: rep bsfq {{[0-9]+}}(%rsp), %rax
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT: rep bsfq %r8, %rsi
-; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %esi, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r14, %rdx
+; SSE-NEXT: rep bsfq %r13, %rcx
+; SSE-NEXT: testq %r13, %r13
; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT: subl $-128, %eax
+; SSE-NEXT: orq %rbp, %r12
+; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: movdqa {{[0-9]+}}(%rsp), %xmm0
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %r13, %r11
-; SSE-NEXT: cmovnel %ebp, %eax
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; SSE-NEXT: orq %rbx, %r9
-; SSE-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT: orq %r15, %rdi
-; SSE-NEXT: orq %r10, %rdi
-; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT: ptest %xmm0, %xmm0
+; SSE-NEXT: cmovnel %r14d, %eax
+; SSE-NEXT: orq %r11, %r8
+; SSE-NEXT: orq %r10, %rsi
+; SSE-NEXT: orq %r8, %rsi
+; SSE-NEXT: orq {{[0-9]+}}(%rsp), %rdx
; SSE-NEXT: orq %r9, %rdi
-; SSE-NEXT: cmovnel %r12d, %eax
+; SSE-NEXT: orq %rdx, %rdi
+; SSE-NEXT: addl $512, %eax # imm = 0x200
+; SSE-NEXT: orq %rsi, %rdi
+; SSE-NEXT: cmovnel %ebx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
; SSE-NEXT: popq %r12
@@ -7607,111 +7083,108 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq %r9, %rbx
-; AVX2-NEXT: movq %r8, %r14
-; AVX2-NEXT: movq %rcx, %r11
-; AVX2-NEXT: movq %rdx, %r10
-; AVX2-NEXT: movq %rsi, %r9
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT: movq %r9, %r10
+; AVX2-NEXT: movq %r8, %r9
+; AVX2-NEXT: movq %rcx, %r8
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: tzcntq %rdi, %rax
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r9, %r15
-; AVX2-NEXT: addl $64, %r15d
-; AVX2-NEXT: testq %rdi, %rdi
-; AVX2-NEXT: cmovnel %eax, %r15d
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %r10, %r12
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r11, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %r12d, %eax
-; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %rdi, %r12
-; AVX2-NEXT: orq %r9, %r12
-; AVX2-NEXT: cmovnel %r15d, %eax
-; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %r14, %r15
-; AVX2-NEXT: movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rdi, %rbx
; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: tzcntq %rsi, %r12
; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %r14, %r14
-; AVX2-NEXT: cmovnel %r15d, %r12d
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %rcx, %r13
+; AVX2-NEXT: testq %rdi, %rdi
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rdx, %rbx
; AVX2-NEXT: xorl %r15d, %r15d
-; AVX2-NEXT: tzcntq %rdx, %r15
+; AVX2-NEXT: tzcntq %r8, %r15
; AVX2-NEXT: addl $64, %r15d
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %r13d, %r15d
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: cmovnel %ebx, %r15d
; AVX2-NEXT: subl $-128, %r15d
-; AVX2-NEXT: movq %r14, %r13
-; AVX2-NEXT: orq %rbx, %r13
+; AVX2-NEXT: movq %rdi, %rbx
+; AVX2-NEXT: orq %rsi, %rbx
; AVX2-NEXT: cmovnel %r12d, %r15d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: addl $256, %r15d # imm = 0x100
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r10, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %ebx, %r12d
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r11, %r13
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %rcx, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %r13d, %ebx
+; AVX2-NEXT: subl $-128, %ebx
; AVX2-NEXT: movq %r9, %r13
-; AVX2-NEXT: orq %r11, %r13
-; AVX2-NEXT: movq %rdi, %rbp
-; AVX2-NEXT: orq %r10, %rbp
-; AVX2-NEXT: orq %r13, %rbp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: cmovnel %eax, %r15d
-; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: tzcntq %r12, %rbp
-; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r13, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r12, %r12
-; AVX2-NEXT: cmovnel %ebp, %eax
+; AVX2-NEXT: orq %r10, %r13
+; AVX2-NEXT: cmovnel %r12d, %ebx
+; AVX2-NEXT: addl $256, %ebx # imm = 0x100
+; AVX2-NEXT: movq %rsi, %r12
+; AVX2-NEXT: orq %r8, %r12
+; AVX2-NEXT: movq %rdi, %r13
+; AVX2-NEXT: orq %rdx, %r13
+; AVX2-NEXT: orq %r12, %r13
+; AVX2-NEXT: cmovnel %r15d, %ebx
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq %rax, %r15
+; AVX2-NEXT: xorl %r13d, %r13d
+; AVX2-NEXT: tzcntq %r12, %r13
+; AVX2-NEXT: addl $64, %r13d
+; AVX2-NEXT: testq %rax, %rax
+; AVX2-NEXT: cmovnel %r15d, %r13d
; AVX2-NEXT: xorl %ebp, %ebp
-; AVX2-NEXT: tzcntq %r8, %rbp
-; AVX2-NEXT: addl $64, %ebp
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %rsi, %rcx
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %ecx, %ebp
-; AVX2-NEXT: subl $-128, %ebp
-; AVX2-NEXT: movq %r12, %rcx
-; AVX2-NEXT: orq %r13, %rcx
-; AVX2-NEXT: cmovnel %eax, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %rbx, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: tzcntq %r14, %rbp
+; AVX2-NEXT: xorl %r15d, %r15d
+; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT: addl $64, %r15d
+; AVX2-NEXT: testq %r14, %r14
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT: cmovnel %ebp, %r15d
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT: subl $-128, %r15d
+; AVX2-NEXT: orq %r12, %rax
+; AVX2-NEXT: cmovnel %r13d, %r15d
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
+; AVX2-NEXT: tzcntq %rbp, %rax
+; AVX2-NEXT: xorl %r12d, %r12d
+; AVX2-NEXT: tzcntq %r14, %r12
+; AVX2-NEXT: addl $64, %r12d
+; AVX2-NEXT: testq %rbp, %rbp
+; AVX2-NEXT: cmovnel %eax, %r12d
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: tzcntq %r8, %rsi
-; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %esi, %eax
-; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %rbx, %rdx
+; AVX2-NEXT: xorl %ecx, %ecx
+; AVX2-NEXT: tzcntq %r13, %rcx
+; AVX2-NEXT: testq %r13, %r13
; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT: subl $-128, %eax
+; AVX2-NEXT: orq %r14, %rbp
+; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vmovdqa {{[0-9]+}}(%rsp), %xmm0
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r13, %r12
-; AVX2-NEXT: cmovnel %ebp, %eax
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT: orq %r11, %r9
-; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: orq %r14, %rdi
-; AVX2-NEXT: orq %r10, %rdi
-; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r9, %rdi
+; AVX2-NEXT: vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: vptest %xmm0, %xmm0
; AVX2-NEXT: cmovnel %r15d, %eax
+; AVX2-NEXT: orq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT: orq %r10, %rsi
+; AVX2-NEXT: orq %r8, %rsi
+; AVX2-NEXT: orq %r11, %rdx
+; AVX2-NEXT: orq %r9, %rdi
+; AVX2-NEXT: orq %rdx, %rdi
+; AVX2-NEXT: addl $512, %eax # imm = 0x200
+; AVX2-NEXT: orq %rsi, %rdi
+; AVX2-NEXT: cmovnel %ebx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
@@ -7898,384 +7371,296 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
define i32 @load_cttz_undef_i1024(ptr %p0) nounwind {
; SSE-LABEL: load_cttz_undef_i1024:
; SSE: # %bb.0:
-; SSE-NEXT: pushq %rbp
-; SSE-NEXT: pushq %r15
-; SSE-NEXT: pushq %r14
-; SSE-NEXT: pushq %r13
-; SSE-NEXT: pushq %r12
; SSE-NEXT: pushq %rbx
-; SSE-NEXT: movq 72(%rdi), %rbx
-; SSE-NEXT: movq 56(%rdi), %r9
-; SSE-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 48(%rdi), %rcx
-; SSE-NEXT: movq 40(%rdi), %r10
-; SSE-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: movq 32(%rdi), %rsi
-; SSE-NEXT: movq 24(%rdi), %rbp
+; SSE-NEXT: movq 104(%rdi), %rcx
+; SSE-NEXT: movq 40(%rdi), %rax
+; SSE-NEXT: movq 16(%rdi), %rdx
; SSE-NEXT: movq (%rdi), %r8
-; SSE-NEXT: movq 8(%rdi), %r11
-; SSE-NEXT: rep bsfq %r8, %rax
-; SSE-NEXT: rep bsfq %r11, %rdx
-; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: movq 8(%rdi), %r9
+; SSE-NEXT: rep bsfq %r8, %rsi
+; SSE-NEXT: rep bsfq %r9, %r11
+; SSE-NEXT: addl $64, %r11d
; SSE-NEXT: testq %r8, %r8
-; SSE-NEXT: cmovnel %eax, %edx
-; SSE-NEXT: movq 16(%rdi), %r14
-; SSE-NEXT: rep bsfq %r14, %r15
-; SSE-NEXT: rep bsfq %rbp, %rax
-; SSE-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r14, %r14
-; SSE-NEXT: cmovnel %r15d, %eax
-; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: movq %r8, %r15
-; SSE-NEXT: orq %r11, %r15
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: rep bsfq %rsi, %rdx
-; SSE-NEXT: rep bsfq %r10, %r13
-; SSE-NEXT: addl $64, %r13d
-; SSE-NEXT: testq %rsi, %rsi
-; SSE-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: cmovnel %edx, %r13d
-; SSE-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT: rep bsfq %rcx, %rdx
-; SSE-NEXT: rep bsfq %r9, %r15
-; SSE-NEXT: addl $64, %r15d
-; SSE-NEXT: testq %rcx, %rcx
-; SSE-NEXT: cmovnel %edx, %r15d
-; SSE-NEXT: movq 64(%rdi), %r12
-; SSE-NEXT: subl $-128, %r15d
-; SSE-NEXT: movq %rsi, %rdx
-; SSE-NEXT: orq %r10, %rdx
-; SSE-NEXT: cmovnel %r13d, %r15d
-; SSE-NEXT: addl $256, %r15d # imm = 0x100
-; SSE-NEXT: movq %r11, %rdx
-; SSE-NEXT: orq %rbp, %rdx
-; SSE-NEXT: movq %r8, %r13
-; SSE-NEXT: orq %r14, %r13
-; SSE-NEXT: orq %rdx, %r13
-; SSE-NEXT: cmovnel %eax, %r15d
-; SSE-NEXT: rep bsfq %r12, %rdx
-; SSE-NEXT: rep bsfq %rbx, %rax
-; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: testq %r12, %r12
-; SSE-NEXT: cmovnel %edx, %eax
-; SSE-NEXT: movq 88(%rdi), %rbp
-; SSE-NEXT: rep bsfq %rbp, %r13
-; SSE-NEXT: addl $64, %r13d
-; SSE-NEXT: movq 80(%rdi), %r10
-; SSE-NEXT: rep bsfq %r10, %rcx
-; SSE-NEXT: testq %r10, %r10
-; SSE-NEXT: cmovnel %ecx, %r13d
-; SSE-NEXT: subl $-128, %r13d
-; SSE-NEXT: movq %r12, %rcx
-; SSE-NEXT: orq %rbx, %rcx
-; SSE-NEXT: cmovnel %eax, %r13d
-; SSE-NEXT: movq 104(%rdi), %r9
-; SSE-NEXT: rep bsfq %r9, %rcx
-; SSE-NEXT: addl $64, %ecx
-; SSE-NEXT: movq 96(%rdi), %rdx
-; SSE-NEXT: rep bsfq %rdx, %rax
+; SSE-NEXT: cmovnel %esi, %r11d
+; SSE-NEXT: rep bsfq 24(%rdi), %rsi
+; SSE-NEXT: rep bsfq %rdx, %rbx
+; SSE-NEXT: addl $64, %esi
; SSE-NEXT: testq %rdx, %rdx
-; SSE-NEXT: cmovnel %eax, %ecx
+; SSE-NEXT: movq 32(%rdi), %r10
+; SSE-NEXT: cmovnel %ebx, %esi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r9, %r8
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: rep bsfq %r10, %rdx
+; SSE-NEXT: rep bsfq %rax, %r11
+; SSE-NEXT: addl $64, %r11d
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %edx, %r11d
+; SSE-NEXT: movq 48(%rdi), %r8
+; SSE-NEXT: rep bsfq %r8, %r9
+; SSE-NEXT: rep bsfq 56(%rdi), %rdx
+; SSE-NEXT: addl $64, %edx
+; SSE-NEXT: testq %r8, %r8
+; SSE-NEXT: cmovnel %r9d, %edx
+; SSE-NEXT: movq 80(%rdi), %r9
+; SSE-NEXT: movq 72(%rdi), %r8
+; SSE-NEXT: subl $-128, %edx
+; SSE-NEXT: orq %rax, %r10
+; SSE-NEXT: movq 64(%rdi), %rax
+; SSE-NEXT: movdqa 16(%rdi), %xmm0
+; SSE-NEXT: cmovnel %r11d, %edx
+; SSE-NEXT: movdqa (%rdi), %xmm1
+; SSE-NEXT: addl $256, %edx # imm = 0x100
+; SSE-NEXT: movdqa %xmm1, %xmm2
+; SSE-NEXT: por %xmm0, %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %edx
+; SSE-NEXT: rep bsfq %rax, %rsi
+; SSE-NEXT: rep bsfq %r8, %r10
+; SSE-NEXT: addl $64, %r10d
+; SSE-NEXT: testq %rax, %rax
+; SSE-NEXT: cmovnel %esi, %r10d
+; SSE-NEXT: rep bsfq 88(%rdi), %rsi
+; SSE-NEXT: rep bsfq %r9, %r11
+; SSE-NEXT: addl $64, %esi
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: movq 96(%rdi), %r9
+; SSE-NEXT: cmovnel %r11d, %esi
+; SSE-NEXT: subl $-128, %esi
+; SSE-NEXT: orq %r8, %rax
+; SSE-NEXT: cmovnel %r10d, %esi
+; SSE-NEXT: rep bsfq %r9, %rax
+; SSE-NEXT: rep bsfq %rcx, %r8
+; SSE-NEXT: addl $64, %r8d
+; SSE-NEXT: testq %r9, %r9
+; SSE-NEXT: cmovnel %eax, %r8d
+; SSE-NEXT: movq 112(%rdi), %r10
+; SSE-NEXT: rep bsfq %r10, %r11
; SSE-NEXT: rep bsfq 120(%rdi), %rax
-; SSE-NEXT: movq 112(%rdi), %rdi
; SSE-NEXT: addl $64, %eax
-; SSE-NEXT: rep bsfq %rdi, %rsi
-; SSE-NEXT: testq %rdi, %rdi
-; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: testq %r10, %r10
+; SSE-NEXT: cmovnel %r11d, %eax
; SSE-NEXT: subl $-128, %eax
-; SSE-NEXT: orq %r9, %rdx
-; SSE-NEXT: cmovnel %ecx, %eax
-; SSE-NEXT: orq %rbp, %rbx
-; SSE-NEXT: orq %r10, %r12
+; SSE-NEXT: orq %rcx, %r9
+; SSE-NEXT: cmovnel %r8d, %eax
+; SSE-NEXT: movdqa 64(%rdi), %xmm2
; SSE-NEXT: addl $256, %eax # imm = 0x100
-; SSE-NEXT: orq %rbx, %r12
-; SSE-NEXT: cmovnel %r13d, %eax
-; SSE-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT: orq %rcx, %r11
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT: orq %r14, %r8
+; SSE-NEXT: por 80(%rdi), %xmm2
+; SSE-NEXT: ptest %xmm2, %xmm2
+; SSE-NEXT: cmovnel %esi, %eax
+; SSE-NEXT: por 48(%rdi), %xmm0
+; SSE-NEXT: por 32(%rdi), %xmm1
; SSE-NEXT: addl $512, %eax # imm = 0x200
-; SSE-NEXT: orq %r11, %r8
-; SSE-NEXT: cmovnel %r15d, %eax
+; SSE-NEXT: por %xmm0, %xmm1
+; SSE-NEXT: ptest %xmm1, %xmm1
+; SSE-NEXT: cmovnel %edx, %eax
; SSE-NEXT: # kill: def $eax killed $eax killed $rax
; SSE-NEXT: popq %rbx
-; SSE-NEXT: popq %r12
-; SSE-NEXT: popq %r13
-; SSE-NEXT: popq %r14
-; SSE-NEXT: popq %r15
-; SSE-NEXT: popq %rbp
; SSE-NEXT: retq
;
; AVX2-LABEL: load_cttz_undef_i1024:
; AVX2: # %bb.0:
-; AVX2-NEXT: pushq %rbp
-; AVX2-NEXT: pushq %r15
-; AVX2-NEXT: pushq %r14
-; AVX2-NEXT: pushq %r13
-; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: movq 72(%rdi), %r14
-; AVX2-NEXT: movq 64(%rdi), %r15
-; AVX2-NEXT: movq 56(%rdi), %r9
-; AVX2-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 48(%rdi), %rcx
-; AVX2-NEXT: movq 40(%rdi), %r10
-; AVX2-NEXT: movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq 32(%rdi), %rsi
-; AVX2-NEXT: movq 24(%rdi), %rbp
-; AVX2-NEXT: movq 16(%rdi), %rbx
+; AVX2-NEXT: movq 104(%rdi), %rcx
+; AVX2-NEXT: movq 48(%rdi), %rsi
+; AVX2-NEXT: movq 16(%rdi), %rdx
; AVX2-NEXT: movq (%rdi), %r8
-; AVX2-NEXT: movq 8(%rdi), %r11
+; AVX2-NEXT: movq 8(%rdi), %r9
; AVX2-NEXT: tzcntq %r8, %rax
-; AVX2-NEXT: tzcntq %r11, %rdx
-; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r9, %rbx
+; AVX2-NEXT: addl $64, %ebx
; AVX2-NEXT: testq %r8, %r8
-; AVX2-NEXT: cmovnel %eax, %edx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %rbx, %r12
+; AVX2-NEXT: cmovnel %eax, %ebx
+; AVX2-NEXT: tzcntq %rdx, %r11
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rbp, %rax
-; AVX2-NEXT: movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: tzcntq 24(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %rbx, %rbx
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: testq %rdx, %rdx
+; AVX2-NEXT: movq 40(%rdi), %r10
+; AVX2-NEXT: cmovnel %r11d, %eax
+; AVX2-NEXT: movq 32(%rdi), %r11
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: movq %r8, %r12
-; AVX2-NEXT: orq %r11, %r12
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %rsi, %rdx
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %r10, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: cmovnel %edx, %r13d
-; AVX2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: orq %r9, %r8
+; AVX2-NEXT: cmovnel %ebx, %eax
; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %rcx, %rdx
-; AVX2-NEXT: xorl %r12d, %r12d
-; AVX2-NEXT: tzcntq %r9, %r12
-; AVX2-NEXT: addl $64, %r12d
-; AVX2-NEXT: testq %rcx, %rcx
-; AVX2-NEXT: cmovnel %edx, %r12d
-; AVX2-NEXT: subl $-128, %r12d
-; AVX2-NEXT: movq %rsi, %rdx
-; AVX2-NEXT: orq %r10, %rdx
-; AVX2-NEXT: cmovnel %r13d, %r12d
-; AVX2-NEXT: addl $256, %r12d # imm = 0x100
-; AVX2-NEXT: movq %r11, %rdx
-; AVX2-NEXT: orq %rbp, %rdx
-; AVX2-NEXT: movq %r8, %r13
-; AVX2-NEXT: orq %rbx, %r13
-; AVX2-NEXT: orq %rdx, %r13
-; AVX2-NEXT: cmovnel %eax, %r12d
+; AVX2-NEXT: tzcntq %r11, %rdx
+; AVX2-NEXT: xorl %ebx, %ebx
+; AVX2-NEXT: tzcntq %r10, %rbx
+; AVX2-NEXT: addl $64, %ebx
+; AVX2-NEXT: testq %r11, %r11
+; AVX2-NEXT: cmovnel %edx, %ebx
; AVX2-NEXT: xorl %edx, %edx
-; AVX2-NEXT: tzcntq %r15, %rdx
+; AVX2-NEXT: tzcntq 56(%rdi), %rdx
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rsi, %r8
+; AVX2-NEXT: addl $64, %edx
+; AVX2-NEXT: testq %rsi, %rsi
+; AVX2-NEXT: movq 80(%rdi), %r9
+; AVX2-NEXT: cmovnel %r8d, %edx
+; AVX2-NEXT: movq 72(%rdi), %r8
+; AVX2-NEXT: subl $-128, %edx
+; AVX2-NEXT: orq %r10, %r11
+; AVX2-NEXT: movq 64(%rdi), %r10
+; AVX2-NEXT: cmovnel %ebx, %edx
+; AVX2-NEXT: vmovdqu (%rdi), %ymm0
+; AVX2-NEXT: addl $256, %edx # imm = 0x100
+; AVX2-NEXT: vpor 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %eax, %edx
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %r14, %rax
-; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: testq %r15, %r15
-; AVX2-NEXT: cmovnel %edx, %eax
-; AVX2-NEXT: movq 88(%rdi), %rbp
-; AVX2-NEXT: xorl %r13d, %r13d
-; AVX2-NEXT: tzcntq %rbp, %r13
-; AVX2-NEXT: addl $64, %r13d
-; AVX2-NEXT: movq 80(%rdi), %r10
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %r10, %rcx
+; AVX2-NEXT: tzcntq %r10, %rax
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: tzcntq %r8, %r11
+; AVX2-NEXT: addl $64, %r11d
; AVX2-NEXT: testq %r10, %r10
-; AVX2-NEXT: cmovnel %ecx, %r13d
-; AVX2-NEXT: subl $-128, %r13d
-; AVX2-NEXT: movq %r15, %rcx
-; AVX2-NEXT: orq %r14, %rcx
-; AVX2-NEXT: cmovnel %eax, %r13d
-; AVX2-NEXT: movq 104(%rdi), %r9
-; AVX2-NEXT: xorl %ecx, %ecx
-; AVX2-NEXT: tzcntq %r9, %rcx
-; AVX2-NEXT: addl $64, %ecx
-; AVX2-NEXT: movq 96(%rdi), %rdx
+; AVX2-NEXT: cmovnel %eax, %r11d
+; AVX2-NEXT: xorl %esi, %esi
+; AVX2-NEXT: tzcntq 88(%rdi), %rsi
; AVX2-NEXT: xorl %eax, %eax
-; AVX2-NEXT: tzcntq %rdx, %rax
-; AVX2-NEXT: testq %rdx, %rdx
-; AVX2-NEXT: cmovnel %eax, %ecx
-; AVX2-NEXT: movq 112(%rdi), %rsi
+; AVX2-NEXT: tzcntq %r9, %rax
+; AVX2-NEXT: addl $64, %esi
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: movq 96(%rdi), %r9
+; AVX2-NEXT: cmovnel %eax, %esi
+; AVX2-NEXT: subl $-128, %esi
+; AVX2-NEXT: orq %r8, %r10
+; AVX2-NEXT: cmovnel %r11d, %esi
+; AVX2-NEXT: xorl %eax, %eax
+; AVX2-NEXT: tzcntq %r9, %rax
+; AVX2-NEXT: xorl %r8d, %r8d
+; AVX2-NEXT: tzcntq %rcx, %r8
+; AVX2-NEXT: addl $64, %r8d
+; AVX2-NEXT: testq %r9, %r9
+; AVX2-NEXT: cmovnel %eax, %r8d
+; AVX2-NEXT: movq 112(%rdi), %r10
+; AVX2-NEXT: xorl %r11d, %r11d
+; AVX2-NEXT: tzcntq %r10, %r11
; AVX2-NEXT: xorl %eax, %eax
; AVX2-NEXT: tzcntq 120(%rdi), %rax
; AVX2-NEXT: addl $64, %eax
-; AVX2-NEXT: tzcntq %rsi, %rdi
-; AVX2-NEXT: testq %rsi, %rsi
-; AVX2-NEXT: cmovnel %edi, %eax
+; AVX2-NEXT: testq %r10, %r10
+; AVX2-NEXT: cmovnel %r11d, %eax
; AVX2-NEXT: subl $-128, %eax
-; AVX2-NEXT: orq %r9, %rdx
-; AVX2-NEXT: cmovnel %ecx, %eax
-; AVX2-NEXT: orq %rbp, %r14
-; AVX2-NEXT: orq %r10, %r15
+; AVX2-NEXT: orq %rcx, %r9
+; AVX2-NEXT: cmovnel %r8d, %eax
+; AVX2-NEXT: vmovdqa 64(%rdi), %xmm1
; AVX2-NEXT: addl $256, %eax # imm = 0x100
-; AVX2-NEXT: orq %r14, %r15
-; AVX2-NEXT: cmovnel %r13d, %eax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rcx, %r11
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT: orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT: orq %rbx, %r8
+; AVX2-NEXT: vpor 80(%rdi), %xmm1, %xmm1
+; AVX2-NEXT: vptest %xmm1, %xmm1
+; AVX2-NEXT: cmovnel %esi, %eax
+; AVX2-NEXT: vpor 32(%rdi), %ymm0, %ymm0
; AVX2-NEXT: addl $512, %eax # imm = 0x200
-; AVX2-NEXT: orq %r11, %r8
-; AVX2-NEXT: cmovnel %r12d, %eax
+; AVX2-NEXT: vptest %ymm0, %ymm0
+; AVX2-NEXT: cmovnel %edx, %eax
; AVX2-NEXT: # kill: def $eax killed $eax killed $rax
; AVX2-NEXT: popq %rbx
-; AVX2-NEXT: popq %r12
-; AVX2-NEXT: popq %r13
-; AVX2-NEXT: popq %r14
-; AVX2-NEXT: popq %r15
-; AVX2-NEXT: popq %rbp
+; AVX2-NEXT: vzeroupper
; AVX2-NEXT: retq
;
; AVX512F-LABEL: load_cttz_undef_i1024:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512F-NEXT: movq 16(%rdi), %rax
-; AVX512F-NEXT: movq (%rdi), %rcx
-; AVX512F-NEXT: movq 8(%rdi), %rdx
-; AVX512F-NEXT: movq 24(%rdi), %rsi
-; AVX512F-NEXT: orq 56(%rdi), %rsi
-; AVX512F-NEXT: orq 40(%rdi), %rdx
-; AVX512F-NEXT: orq 48(%rdi), %rax
-; AVX512F-NEXT: orq %rsi, %rdx
-; AVX512F-NEXT: orq 32(%rdi), %rcx
-; AVX512F-NEXT: orq %rax, %rcx
+; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512F-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512F-NEXT: vplzcntq %zmm3, %zmm3
; AVX512F-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
; AVX512F-NEXT: vpsubq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm1, %esi
-; AVX512F-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT: vplzcntq %zmm1, %zmm1
-; AVX512F-NEXT: vpsubq %zmm1, %zmm4, %zmm1
; AVX512F-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT: vmovd %xmm0, %eax
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm3, %ecx
+; AVX512F-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT: vpsubq %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT: vmovd %xmm1, %eax
; AVX512F-NEXT: addl $512, %eax # imm = 0x200
-; AVX512F-NEXT: orq %rdx, %rcx
-; AVX512F-NEXT: cmovnel %esi, %eax
+; AVX512F-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT: vptest %ymm0, %ymm0
+; AVX512F-NEXT: cmovnel %ecx, %eax
; AVX512F-NEXT: retq
;
; AVX512POPCNT-LABEL: load_cttz_undef_i1024:
; AVX512POPCNT: # %bb.0:
-; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512POPCNT-NEXT: movq 16(%rdi), %rax
-; AVX512POPCNT-NEXT: movq (%rdi), %rcx
-; AVX512POPCNT-NEXT: movq 8(%rdi), %rdx
-; AVX512POPCNT-NEXT: movq 24(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 56(%rdi), %rsi
-; AVX512POPCNT-NEXT: orq 40(%rdi), %rdx
-; AVX512POPCNT-NEXT: orq 48(%rdi), %rax
-; AVX512POPCNT-NEXT: orq %rsi, %rdx
-; AVX512POPCNT-NEXT: orq 32(%rdi), %rcx
-; AVX512POPCNT-NEXT: orq %rax, %rcx
+; AVX512POPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512POPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512POPCNT-NEXT: vpopcntq %zmm3, %zmm3
; AVX512POPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
-; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm1, %esi
-; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512POPCNT-NEXT: vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
; AVX512POPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm3, %ecx
+; AVX512POPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT: vpopcntq %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
+; AVX512POPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT: vmovd %xmm1, %eax
; AVX512POPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT: orq %rdx, %rcx
-; AVX512POPCNT-NEXT: cmovnel %esi, %eax
+; AVX512POPCNT-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT: cmovnel %ecx, %eax
; AVX512POPCNT-NEXT: retq
;
; AVX512VL-LABEL: load_cttz_undef_i1024:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512VL-NEXT: movq 16(%rdi), %rax
-; AVX512VL-NEXT: movq (%rdi), %rcx
-; AVX512VL-NEXT: movq 8(%rdi), %rdx
-; AVX512VL-NEXT: movq 24(%rdi), %rsi
-; AVX512VL-NEXT: orq 56(%rdi), %rsi
-; AVX512VL-NEXT: orq 40(%rdi), %rdx
-; AVX512VL-NEXT: orq 48(%rdi), %rax
-; AVX512VL-NEXT: orq 32(%rdi), %rcx
-; AVX512VL-NEXT: orq %rsi, %rdx
-; AVX512VL-NEXT: orq %rax, %rcx
+; AVX512VL-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512VL-NEXT: vplzcntq %zmm3, %zmm3
; AVX512VL-NEXT: vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
; AVX512VL-NEXT: vpsubq %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm1, %esi
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VL-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT: vplzcntq %zmm1, %zmm1
-; AVX512VL-NEXT: vpsubq %zmm1, %zmm4, %zmm1
; AVX512VL-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT: vmovd %xmm0, %eax
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm3, %ecx
+; AVX512VL-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT: vplzcntq %zmm2, %zmm2
+; AVX512VL-NEXT: vpsubq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vmovd %xmm1, %eax
; AVX512VL-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: orq %rdx, %rcx
-; AVX512VL-NEXT: cmovnel %esi, %eax
+; AVX512VL-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT: vptest %ymm0, %ymm0
+; AVX512VL-NEXT: cmovnel %ecx, %eax
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VLPOPCNT-LABEL: load_cttz_undef_i1024:
; AVX512VLPOPCNT: # %bb.0:
-; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm0
-; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm1
-; AVX512VLPOPCNT-NEXT: movq 16(%rdi), %rax
-; AVX512VLPOPCNT-NEXT: movq (%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: movq 8(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: movq 24(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq 56(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT: orq 40(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT: orq 48(%rdi), %rax
-; AVX512VLPOPCNT-NEXT: orq 32(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT: orq %rsi, %rdx
-; AVX512VLPOPCNT-NEXT: orq %rax, %rcx
+; AVX512VLPOPCNT-NEXT: vmovdqu64 (%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT: vmovdqu64 64(%rdi), %zmm1
; AVX512VLPOPCNT-NEXT: vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VLPOPCNT-NEXT: vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VLPOPCNT-NEXT: vpandnq %zmm3, %zmm0, %zmm3
; AVX512VLPOPCNT-NEXT: vpopcntq %zmm3, %zmm3
; AVX512VLPOPCNT-NEXT: vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm3, %zmm3
-; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %esi
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT: vpopcntq %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm1, %zmm1
; AVX512VLPOPCNT-NEXT: vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT: vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT: vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm3, %ecx
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT: vpopcntq %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vpaddq %zmm4, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT: vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT: vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT: vmovd %xmm1, %eax
; AVX512VLPOPCNT-NEXT: addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT: orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT: cmovnel %esi, %eax
+; AVX512VLPOPCNT-NEXT: vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT: cmovnel %ecx, %eax
; AVX512VLPOPCNT-NEXT: vzeroupper
; AVX512VLPOPCNT-NEXT: retq
%a0 = load i1024, ptr %p0
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i256.ll b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
index 549b6e3fc0dd9..cc0c1ef23c3a5 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i256.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
@@ -760,107 +760,71 @@ define i256 @fshl_rot_i256_load(ptr %p0, i256 %a2) nounwind {
;
; AVX512F-LABEL: fshl_rot_i256_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq 16(%rsi), %rdx
-; AVX512F-NEXT: movq (%rsi), %r9
-; AVX512F-NEXT: movq 8(%rsi), %r8
-; AVX512F-NEXT: movq 24(%rsi), %rax
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: cmovneq %r8, %rsi
-; AVX512F-NEXT: movq %r9, %r10
-; AVX512F-NEXT: cmovneq %rdx, %r10
-; AVX512F-NEXT: cmovneq %rax, %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmovneq %r9, %rdx
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: cmovneq %r8, %rdi
-; AVX512F-NEXT: cmovneq %r10, %r8
-; AVX512F-NEXT: cmoveq %rsi, %rdx
-; AVX512F-NEXT: cmovneq %rsi, %r10
-; AVX512F-NEXT: movq %r10, %rsi
-; AVX512F-NEXT: shldq %cl, %rdx, %rsi
-; AVX512F-NEXT: movq %r8, %r9
-; AVX512F-NEXT: shldq %cl, %r10, %r9
-; AVX512F-NEXT: movq %rdi, %r10
-; AVX512F-NEXT: shldq %cl, %r8, %r10
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %rdx, 24(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %r9, 8(%rax)
-; AVX512F-NEXT: movq %rsi, (%rax)
+; AVX512F-NEXT: movzbl %dl, %ecx
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpexpandq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: vpsllq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512F-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_rot_i256_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq %rdx, %rcx
-; AVX512VL-NEXT: movq 16(%rsi), %rdx
-; AVX512VL-NEXT: movq (%rsi), %r8
-; AVX512VL-NEXT: movq 8(%rsi), %rax
-; AVX512VL-NEXT: movq 24(%rsi), %rsi
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq %rsi, %r9
-; AVX512VL-NEXT: cmovneq %rax, %r9
-; AVX512VL-NEXT: movq %r8, %r10
-; AVX512VL-NEXT: cmovneq %rdx, %r10
-; AVX512VL-NEXT: cmovneq %rsi, %rax
-; AVX512VL-NEXT: cmovneq %r8, %rdx
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %rdx, %rsi
-; AVX512VL-NEXT: cmovneq %rax, %rsi
-; AVX512VL-NEXT: cmovneq %r10, %rax
-; AVX512VL-NEXT: cmoveq %r9, %rdx
-; AVX512VL-NEXT: cmovneq %r9, %r10
-; AVX512VL-NEXT: movq %r10, %r8
-; AVX512VL-NEXT: shldq %cl, %rdx, %r8
-; AVX512VL-NEXT: movq %rax, %r9
-; AVX512VL-NEXT: shldq %cl, %r10, %r9
-; AVX512VL-NEXT: movq %rsi, %r10
-; AVX512VL-NEXT: shldq %cl, %rax, %r10
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %rsi, %rdx
+; AVX512VL-NEXT: movzbl %dl, %eax
+; AVX512VL-NEXT: vpbroadcastq %rax, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %ecx
+; AVX512VL-NEXT: shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpexpandq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: vpsllq %xmm0, %zmm3, %zmm0
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %rdx, 24(%rdi)
-; AVX512VL-NEXT: movq %r10, 16(%rdi)
-; AVX512VL-NEXT: movq %r9, 8(%rdi)
-; AVX512VL-NEXT: movq %r8, (%rdi)
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm2, %ymm1
+; AVX512VL-NEXT: vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_rot_i256_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: movq %rdx, %rcx
-; AVX512VBMI-NEXT: movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT: movq (%rsi), %r8
-; AVX512VBMI-NEXT: movq 8(%rsi), %rax
-; AVX512VBMI-NEXT: movq 24(%rsi), %rsi
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq %rsi, %r9
-; AVX512VBMI-NEXT: cmovneq %rax, %r9
-; AVX512VBMI-NEXT: movq %r8, %r10
-; AVX512VBMI-NEXT: cmovneq %rdx, %r10
-; AVX512VBMI-NEXT: cmovneq %rsi, %rax
-; AVX512VBMI-NEXT: cmovneq %r8, %rdx
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %rdx, %rsi
-; AVX512VBMI-NEXT: cmovneq %rax, %rsi
-; AVX512VBMI-NEXT: cmovneq %r10, %rax
-; AVX512VBMI-NEXT: cmoveq %r9, %rdx
-; AVX512VBMI-NEXT: cmovneq %r9, %r10
-; AVX512VBMI-NEXT: movq %r10, %r8
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %r8
-; AVX512VBMI-NEXT: movq %rax, %r9
-; AVX512VBMI-NEXT: shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT: movq %rsi, %r10
-; AVX512VBMI-NEXT: shldq %cl, %rax, %r10
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %rdx
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %rdx, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 16(%rdi)
-; AVX512VBMI-NEXT: movq %r9, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r8, (%rdi)
+; AVX512VBMI-NEXT: movzbl %dl, %ecx
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %edi
+; AVX512VBMI-NEXT: shlxl %edx, %edi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, (%rax)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%r = call i256 @llvm.fshl.i256(i256 %a0, i256 %a0, i256 %a2)
@@ -936,101 +900,67 @@ define i256 @fshr_rot_i256_load(ptr %p0, i256 %a2) nounwind {
;
; AVX512F-LABEL: fshr_rot_i256_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq 16(%rsi), %r8
-; AVX512F-NEXT: movq (%rsi), %r9
-; AVX512F-NEXT: movq 8(%rsi), %rdx
-; AVX512F-NEXT: movq 24(%rsi), %rax
-; AVX512F-NEXT: testb %cl, %cl
-; AVX512F-NEXT: movq %rax, %r10
-; AVX512F-NEXT: cmovnsq %rdx, %r10
-; AVX512F-NEXT: movq %r9, %rsi
-; AVX512F-NEXT: cmovnsq %r8, %rsi
-; AVX512F-NEXT: cmovnsq %rax, %rdx
+; AVX512F-NEXT: movzbl %dl, %eax
+; AVX512F-NEXT: vmovq %rax, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %ecx
+; AVX512F-NEXT: shlxl %eax, %ecx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7,0]
+; AVX512F-NEXT: vpaddq %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT: vpsllq %xmm0, %ymm1, %ymm0
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmovnsq %r9, %r8
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %r8, %rdi
-; AVX512F-NEXT: cmoveq %rdx, %rdi
-; AVX512F-NEXT: cmoveq %rsi, %rdx
-; AVX512F-NEXT: cmoveq %r10, %rsi
-; AVX512F-NEXT: cmovneq %r10, %r8
-; AVX512F-NEXT: movq %r8, %r9
-; AVX512F-NEXT: shrdq %cl, %rsi, %r9
-; AVX512F-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdi, %rdx
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r8, %rdi
-; AVX512F-NEXT: movq %rdi, 24(%rax)
-; AVX512F-NEXT: movq %rdx, 16(%rax)
-; AVX512F-NEXT: movq %rsi, 8(%rax)
-; AVX512F-NEXT: movq %r9, (%rax)
+; AVX512F-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT: vmovdqu %ymm0, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_rot_i256_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: movq %rdx, %rcx
-; AVX512VL-NEXT: movq 16(%rsi), %rax
-; AVX512VL-NEXT: movq (%rsi), %r8
-; AVX512VL-NEXT: movq 8(%rsi), %rdx
-; AVX512VL-NEXT: movq 24(%rsi), %r9
-; AVX512VL-NEXT: testb %cl, %cl
-; AVX512VL-NEXT: movq %r9, %r10
-; AVX512VL-NEXT: cmovnsq %rdx, %r10
-; AVX512VL-NEXT: movq %r8, %rsi
-; AVX512VL-NEXT: cmovnsq %rax, %rsi
-; AVX512VL-NEXT: cmovnsq %r9, %rdx
-; AVX512VL-NEXT: cmovnsq %r8, %rax
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %rax, %r8
-; AVX512VL-NEXT: cmoveq %rdx, %r8
-; AVX512VL-NEXT: cmoveq %rsi, %rdx
-; AVX512VL-NEXT: cmoveq %r10, %rsi
-; AVX512VL-NEXT: cmovneq %r10, %rax
-; AVX512VL-NEXT: movq %rax, %r9
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r9
-; AVX512VL-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %rax, %r8
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %r8, 24(%rdi)
-; AVX512VL-NEXT: movq %rdx, 16(%rdi)
-; AVX512VL-NEXT: movq %rsi, 8(%rdi)
-; AVX512VL-NEXT: movq %r9, (%rdi)
+; AVX512VL-NEXT: movzbl %dl, %ecx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT: vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT: vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm2, %ymm3, %ymm2
+; AVX512VL-NEXT: vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7,0]
+; AVX512VL-NEXT: vpaddq %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT: vpsllq %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT: vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_rot_i256_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: movq %rdx, %rcx
-; AVX512VBMI-NEXT: movq 16(%rsi), %rax
-; AVX512VBMI-NEXT: movq (%rsi), %r8
-; AVX512VBMI-NEXT: movq 8(%rsi), %rdx
-; AVX512VBMI-NEXT: movq 24(%rsi), %r9
-; AVX512VBMI-NEXT: testb %cl, %cl
-; AVX512VBMI-NEXT: movq %r9, %r10
-; AVX512VBMI-NEXT: cmovnsq %rdx, %r10
-; AVX512VBMI-NEXT: movq %r8, %rsi
-; AVX512VBMI-NEXT: cmovnsq %rax, %rsi
-; AVX512VBMI-NEXT: cmovnsq %r9, %rdx
-; AVX512VBMI-NEXT: cmovnsq %r8, %rax
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %rax, %r8
-; AVX512VBMI-NEXT: cmoveq %rdx, %r8
-; AVX512VBMI-NEXT: cmoveq %rsi, %rdx
-; AVX512VBMI-NEXT: cmoveq %r10, %rsi
-; AVX512VBMI-NEXT: cmovneq %r10, %rax
-; AVX512VBMI-NEXT: movq %rax, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r9
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %rax, %r8
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r9, (%rdi)
+; AVX512VBMI-NEXT: movzbl %dl, %ecx
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %edi
+; AVX512VBMI-NEXT: shlxl %edx, %edi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rax)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i256, ptr %p0
%r = call i256 @llvm.fshr.i256(i256 %a0, i256 %a0, i256 %a2)
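
The i256 rotate checks above replace the old cmov ladder with an in-register, lane-granular shift: movl $-1 plus shlxl builds -1 << (amt / 64), kmov turns it into a k-mask, and vpexpandq/vpcompressq under that mask move whole 64-bit lanes up or down with zero fill (the value is first doubled into both zmm halves with vbroadcasti64x4, so a lane shift of the doubled value acts as a lane rotate). A scalar model of the two masked primitives (illustrative C, not code from this patch; names are assumptions):

#include <stdint.h>
#include <string.h>

enum { LANES = 8 }; /* eight 64-bit lanes model one zmm register */

/* vpexpandq with zero-masking mask -1 << (amt/64): lane i of the result is
   src[i - k] for i >= k and 0 below it, i.e. a left shift by whole lanes. */
static void expand_lanes(uint64_t dst[LANES], const uint64_t src[LANES],
                         unsigned amt) {
  unsigned k = amt / 64; /* assumes amt < 64 * LANES */
  memset(dst, 0, k * sizeof(uint64_t));
  memcpy(dst + k, src, (LANES - k) * sizeof(uint64_t));
}

/* vpcompressq with the same mask: lane i of the result is src[i + k] while
   that exists and 0 above it, i.e. a right shift by whole lanes. */
static void compress_lanes(uint64_t dst[LANES], const uint64_t src[LANES],
                           unsigned amt) {
  unsigned k = amt / 64;
  memcpy(dst, src + k, (LANES - k) * sizeof(uint64_t));
  memset(dst + LANES - k, 0, k * sizeof(uint64_t));
}

The remaining amt % 64 bits are then handled with vpsllq/vpsrlq and a valignq-shifted neighbour, as in the i512 tests below.
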
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i512.ll b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
index c6105d2170cf1..9edd8bb7c2f12 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i512.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
@@ -1837,161 +1837,127 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $136, %rsp
-; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512F-NEXT: vmovups (%rdx), %zmm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovups %zmm1, (%rsp)
-; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm2
+; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm3
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT: vpand %xmm0, %xmm3, %xmm4
; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
-; AVX512F-NEXT: movl $512, %esi # imm = 0x200
-; AVX512F-NEXT: subq %rdx, %rsi
-; AVX512F-NEXT: movl %esi, %edi
-; AVX512F-NEXT: andl $63, %edi
-; AVX512F-NEXT: vmovq %rdi, %xmm1
-; AVX512F-NEXT: vpbroadcastq %xmm1, %xmm1
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: vmovdqu64 (%rsp,%rsi), %zmm3
-; AVX512F-NEXT: vpsrlq %xmm1, %zmm3, %zmm4
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [63,63]
-; AVX512F-NEXT: vpandn %xmm5, %xmm1, %xmm1
-; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT: vpaddq %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT: vpsllq %xmm1, %zmm3, %zmm1
-; AVX512F-NEXT: vporq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT: movl %ecx, %esi
-; AVX512F-NEXT: andl $63, %esi
-; AVX512F-NEXT: vmovq %rsi, %xmm3
-; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
-; AVX512F-NEXT: vpandn %xmm5, %xmm3, %xmm4
-; AVX512F-NEXT: shrl $3, %ecx
-; AVX512F-NEXT: andl $56, %ecx
-; AVX512F-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT: subq %rcx, %rsi
-; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm2[7],zmm5[0,1,2,3,4,5,6]
-; AVX512F-NEXT: vpsrlq $1, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq %xmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT: xorl %ecx, %ecx
-; AVX512F-NEXT: negq %rdx
-; AVX512F-NEXT: sbbl %ecx, %ecx
-; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT: kmovw %ecx, %k1
-; AVX512F-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512F-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512F-NEXT: addq $136, %rsp
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: movl $-1, %esi
+; AVX512F-NEXT: shlxl %edx, %esi, %edx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vpexpandq %zmm1, %zmm5 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm4, %zmm5, %zmm4
+; AVX512F-NEXT: vpandn %xmm0, %xmm3, %xmm3
+; AVX512F-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
+; AVX512F-NEXT: vporq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT: movl $512, %edx # imm = 0x200
+; AVX512F-NEXT: subq %rcx, %rdx
+; AVX512F-NEXT: vmovq %rdx, %xmm4
+; AVX512F-NEXT: vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT: vpand %xmm0, %xmm4, %xmm5
+; AVX512F-NEXT: shrl $6, %edx
+; AVX512F-NEXT: shlxl %edx, %esi, %edx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vpcompressq %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm5, %zmm2, %zmm5
+; AVX512F-NEXT: vpandn %xmm0, %xmm4, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7],zmm6[0]
+; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT: vpsllq %xmm0, %zmm2, %zmm0
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: negq %rcx
+; AVX512F-NEXT: sbbl %edx, %edx
+; AVX512F-NEXT: vporq %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vporq %zmm0, %zmm3, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: subq $136, %rsp
; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT: vmovups (%rdx), %ymm2
-; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
-; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, (%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
-; AVX512VL-NEXT: movl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: subq %rdx, %rax
-; AVX512VL-NEXT: movl %eax, %esi
-; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: vmovdqu64 (%rsp,%rax), %zmm2
-; AVX512VL-NEXT: vpsrlq %xmm1, %zmm2, %zmm3
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm4 = [63,63]
-; AVX512VL-NEXT: vpandn %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: valignq {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT: vpaddq %zmm2, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsllq %xmm1, %zmm2, %zmm1
+; AVX512VL-NEXT: vmovdqu64 (%rdx), %zmm1
+; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT: vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT: movl %ecx, %eax
+; AVX512VL-NEXT: shrl $6, %eax
+; AVX512VL-NEXT: movl $-1, %edx
+; AVX512VL-NEXT: shlxl %eax, %edx, %eax
+; AVX512VL-NEXT: kmovd %eax, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm4, %zmm5, %zmm4
+; AVX512VL-NEXT: vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT: vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm5, %zmm2
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT: movl %ecx, %esi
-; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
-; AVX512VL-NEXT: vpandn %xmm4, %xmm2, %xmm3
-; AVX512VL-NEXT: shrl $3, %ecx
-; AVX512VL-NEXT: andl $56, %ecx
-; AVX512VL-NEXT: leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: movl $512, %esi # imm = 0x200
; AVX512VL-NEXT: subq %rcx, %rsi
-; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm4
-; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm5[7],zmm4[0,1,2,3,4,5,6]
-; AVX512VL-NEXT: vpsrlq $1, %zmm5, %zmm5
-; AVX512VL-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
-; AVX512VL-NEXT: vpsllq %xmm2, %zmm4, %zmm2
-; AVX512VL-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT: xorl %ecx, %ecx
-; AVX512VL-NEXT: negq %rdx
-; AVX512VL-NEXT: sbbl %ecx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm4
+; AVX512VL-NEXT: vpand %xmm3, %xmm4, %xmm5
+; AVX512VL-NEXT: shrl $6, %esi
+; AVX512VL-NEXT: shlxl %esi, %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm5, %zmm1, %zmm5
+; AVX512VL-NEXT: vpandn %xmm3, %xmm4, %xmm3
+; AVX512VL-NEXT: valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm6[0]
+; AVX512VL-NEXT: vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: vporq %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: negq %rcx
+; AVX512VL-NEXT: sbbl %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
-; AVX512VL-NEXT: addq $136, %rsp
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: subq $136, %rsp
-; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm1
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: vmovups (%rdx), %ymm2
-; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
-; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm3, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, (%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %esi
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: vmovdqu64 (%rdx), %zmm1
+; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VBMI-NEXT: movl $512, %edx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rcx, %rdx
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm2
+; AVX512VBMI-NEXT: # kill: def $edx killed $edx killed $rdx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpshrdvq %zmm2, %zmm4, %zmm1
; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: andl $511, %edx # imm = 0x1FF
-; AVX512VBMI-NEXT: movl $512, %edi # imm = 0x200
-; AVX512VBMI-NEXT: subq %rdx, %rdi
-; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm1
-; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX512VBMI-NEXT: shrl $3, %edi
-; AVX512VBMI-NEXT: andl $56, %edi
-; AVX512VBMI-NEXT: vmovdqu64 (%rsp,%rdi), %zmm2
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm3
-; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm5 = zmm2[1,2,3,4,5,6,7],zmm4[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm5, %zmm2
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX512VBMI-NEXT: subq %rsi, %rcx
-; AVX512VBMI-NEXT: vmovdqu64 (%rcx), %zmm1
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm4[7],zmm1[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm4, %zmm1
-; AVX512VBMI-NEXT: xorl %ecx, %ecx
-; AVX512VBMI-NEXT: negq %rdx
-; AVX512VBMI-NEXT: sbbl %ecx, %ecx
-; AVX512VBMI-NEXT: kmovd %ecx, %k1
-; AVX512VBMI-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
-; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512VBMI-NEXT: addq $136, %rsp
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: vpshldvq %zmm4, %zmm3, %zmm2
+; AVX512VBMI-NEXT: xorl %edx, %edx
+; AVX512VBMI-NEXT: negq %rcx
+; AVX512VBMI-NEXT: sbbl %edx, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rdi)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -2163,161 +2129,124 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: subq $136, %rsp
-; AVX512F-NEXT: vmovups (%rsi), %zmm0
; AVX512F-NEXT: vmovdqu64 (%rdx), %zmm1
-; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT: vmovdqu64 %zmm2, (%rsp)
-; AVX512F-NEXT: vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT: vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rcx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT: movl %ecx, %eax
+; AVX512F-NEXT: shrl $6, %eax
+; AVX512F-NEXT: movl $-1, %edx
+; AVX512F-NEXT: shlxl %eax, %edx, %eax
+; AVX512F-NEXT: kmovw %eax, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT: movl %ecx, %edx
-; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
-; AVX512F-NEXT: movl $512, %esi # imm = 0x200
-; AVX512F-NEXT: subq %rdx, %rsi
-; AVX512F-NEXT: movl %esi, %edi
-; AVX512F-NEXT: andl $63, %edi
-; AVX512F-NEXT: vmovq %rdi, %xmm0
-; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm0
-; AVX512F-NEXT: shrl $3, %esi
-; AVX512F-NEXT: andl $56, %esi
-; AVX512F-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT: subq %rsi, %rdi
-; AVX512F-NEXT: vmovdqu64 (%rdi), %zmm3
-; AVX512F-NEXT: vpsllq %xmm0, %zmm3, %zmm4
-; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm5 = [63,63]
-; AVX512F-NEXT: vpandn %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm2[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512F-NEXT: vpsllq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: movl $512, %edi # imm = 0x200
+; AVX512F-NEXT: subq %rcx, %rdi
+; AVX512F-NEXT: vmovq %rdi, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: shrl $6, %edi
+; AVX512F-NEXT: shlxl %edi, %edx, %edx
+; AVX512F-NEXT: kmovw %edx, %k1
+; AVX512F-NEXT: vpexpandq (%rsi), %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpand %xmm0, %xmm3, %xmm6
+; AVX512F-NEXT: vpsllq %xmm6, %zmm4, %zmm6
+; AVX512F-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm4[0,1,2,3,4,5,6]
; AVX512F-NEXT: vpsrlq $1, %zmm3, %zmm3
; AVX512F-NEXT: vpsrlq %xmm0, %zmm3, %zmm0
-; AVX512F-NEXT: vporq %zmm0, %zmm4, %zmm0
-; AVX512F-NEXT: movl %ecx, %esi
-; AVX512F-NEXT: andl $63, %esi
-; AVX512F-NEXT: vmovq %rsi, %xmm3
-; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
-; AVX512F-NEXT: vpandn %xmm5, %xmm3, %xmm4
-; AVX512F-NEXT: shrl $3, %ecx
-; AVX512F-NEXT: andl $56, %ecx
-; AVX512F-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm5
-; AVX512F-NEXT: valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT: vpaddq %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT: vpsllq %xmm4, %zmm2, %zmm2
-; AVX512F-NEXT: vpsrlq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT: xorl %ecx, %ecx
-; AVX512F-NEXT: negq %rdx
-; AVX512F-NEXT: sbbl %ecx, %ecx
-; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: xorl %edx, %edx
+; AVX512F-NEXT: negq %rcx
+; AVX512F-NEXT: sbbl %edx, %edx
+; AVX512F-NEXT: vporq %zmm0, %zmm6, %zmm0
+; AVX512F-NEXT: kmovw %edx, %k1
; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm1 {%k1}
; AVX512F-NEXT: vmovdqu64 %zmm1, (%rax)
-; AVX512F-NEXT: addq $136, %rsp
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: subq $136, %rsp
-; AVX512VL-NEXT: vmovups (%rsi), %ymm1
-; AVX512VL-NEXT: vmovups 32(%rsi), %ymm2
+; AVX512VL-NEXT: movq %rdi, %rax
; AVX512VL-NEXT: vmovdqu64 (%rdx), %zmm0
-; AVX512VL-NEXT: vmovups 32(%rdx), %ymm3
-; AVX512VL-NEXT: vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT: vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, (%rsp)
-; AVX512VL-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT: andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
; AVX512VL-NEXT: movl %ecx, %edx
-; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
-; AVX512VL-NEXT: movl $512, %eax # imm = 0x200
-; AVX512VL-NEXT: subq %rdx, %rax
-; AVX512VL-NEXT: movl %eax, %esi
-; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm1
-; AVX512VL-NEXT: shrl $3, %eax
-; AVX512VL-NEXT: andl $56, %eax
-; AVX512VL-NEXT: leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512VL-NEXT: subq %rax, %rsi
-; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm2
-; AVX512VL-NEXT: vpsllq %xmm1, %zmm2, %zmm3
-; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm4 = [63,63]
-; AVX512VL-NEXT: vpandn %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: movl $-1, %edi
+; AVX512VL-NEXT: shlxl %edx, %edi, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT: valignq {{.*#+}} zmm2 = zmm5[7],zmm2[0,1,2,3,4,5,6]
-; AVX512VL-NEXT: vpsrlq $1, %zmm2, %zmm2
-; AVX512VL-NEXT: vpsrlq %xmm1, %zmm2, %zmm1
-; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT: movl %ecx, %esi
-; AVX512VL-NEXT: andl $63, %esi
-; AVX512VL-NEXT: vpbroadcastq %rsi, %xmm2
-; AVX512VL-NEXT: vpandn %xmm4, %xmm2, %xmm3
-; AVX512VL-NEXT: shrl $3, %ecx
-; AVX512VL-NEXT: andl $56, %ecx
-; AVX512VL-NEXT: vmovdqu64 -128(%rsp,%rcx), %zmm4
-; AVX512VL-NEXT: valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT: vpaddq %zmm5, %zmm5, %zmm5
-; AVX512VL-NEXT: vpsllq %xmm3, %zmm5, %zmm3
-; AVX512VL-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
-; AVX512VL-NEXT: vporq %zmm2, %zmm3, %zmm2
-; AVX512VL-NEXT: xorl %ecx, %ecx
-; AVX512VL-NEXT: negq %rdx
-; AVX512VL-NEXT: sbbl %ecx, %ecx
-; AVX512VL-NEXT: kmovd %ecx, %k1
-; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rdi)
-; AVX512VL-NEXT: addq $136, %rsp
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: movl $512, %edx # imm = 0x200
+; AVX512VL-NEXT: subq %rcx, %rdx
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT: shrl $6, %edx
+; AVX512VL-NEXT: shlxl %edx, %edi, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vpexpandq (%rsi), %zmm6 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: xorl %edx, %edx
+; AVX512VL-NEXT: negq %rcx
+; AVX512VL-NEXT: sbbl %edx, %edx
+; AVX512VL-NEXT: kmovd %edx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: vmovdqu64 %zmm0, (%rax)
; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: subq $136, %rsp
-; AVX512VBMI-NEXT: vmovups (%rsi), %ymm1
-; AVX512VBMI-NEXT: vmovups 32(%rsi), %ymm2
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: vmovdqu64 (%rdx), %zmm0
-; AVX512VBMI-NEXT: vmovups 32(%rdx), %ymm3
-; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT: vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, (%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT: movl %ecx, %edx
-; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
-; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx def $rcx
; AVX512VBMI-NEXT: andl $511, %ecx # imm = 0x1FF
-; AVX512VBMI-NEXT: movl $512, %esi # imm = 0x200
-; AVX512VBMI-NEXT: subq %rcx, %rsi
-; AVX512VBMI-NEXT: vpbroadcastq %rsi, %zmm2
-; AVX512VBMI-NEXT: # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX512VBMI-NEXT: shrl $3, %esi
-; AVX512VBMI-NEXT: andl $56, %esi
-; AVX512VBMI-NEXT: leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512VBMI-NEXT: subq %rsi, %rdi
-; AVX512VBMI-NEXT: vmovdqu64 (%rdi), %zmm3
-; AVX512VBMI-NEXT: vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm5 = zmm4[7],zmm3[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT: vpshldvq %zmm2, %zmm5, %zmm3
-; AVX512VBMI-NEXT: shrl $3, %edx
-; AVX512VBMI-NEXT: andl $56, %edx
-; AVX512VBMI-NEXT: vmovdqu64 -128(%rsp,%rdx), %zmm2
-; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm4[0]
-; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: movl $512, %edi # imm = 0x200
+; AVX512VBMI-NEXT: subq %rcx, %rdi
+; AVX512VBMI-NEXT: vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT: # kill: def $edi killed $edi killed $rdi
+; AVX512VBMI-NEXT: shrl $6, %edi
+; AVX512VBMI-NEXT: movl $-1, %r8d
+; AVX512VBMI-NEXT: shlxl %edi, %r8d, %edi
+; AVX512VBMI-NEXT: kmovd %edi, %k1
+; AVX512VBMI-NEXT: vpexpandq (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT: vmovdqu64 (%rdx), %zmm2
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpshldvq %zmm0, %zmm4, %zmm1
+; AVX512VBMI-NEXT: movl %ecx, %edx
+; AVX512VBMI-NEXT: shrl $6, %edx
+; AVX512VBMI-NEXT: shlxl %edx, %r8d, %edx
+; AVX512VBMI-NEXT: kmovd %edx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm2, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT: vpshrdvq %zmm4, %zmm3, %zmm0
; AVX512VBMI-NEXT: xorl %edx, %edx
; AVX512VBMI-NEXT: negq %rcx
; AVX512VBMI-NEXT: sbbl %edx, %edx
; AVX512VBMI-NEXT: kmovd %edx, %k1
-; AVX512VBMI-NEXT: vporq %zmm2, %zmm3, %zmm0 {%k1}
-; AVX512VBMI-NEXT: vmovdqu64 %zmm0, (%rax)
-; AVX512VBMI-NEXT: addq $136, %rsp
+; AVX512VBMI-NEXT: vporq %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT: vmovdqu64 %zmm2, (%rax)
; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
@@ -2493,248 +2422,127 @@ define i512 @fshl_rot_i512_load(ptr %p0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshl_rot_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq 40(%rsi), %r8
-; AVX512F-NEXT: movq 48(%rsi), %rdx
-; AVX512F-NEXT: movq 16(%rsi), %rax
-; AVX512F-NEXT: movq 56(%rsi), %r11
-; AVX512F-NEXT: movq 24(%rsi), %rbx
-; AVX512F-NEXT: movq (%rsi), %r10
-; AVX512F-NEXT: movq 8(%rsi), %r14
-; AVX512F-NEXT: movq 32(%rsi), %r12
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq %r12, %r15
-; AVX512F-NEXT: cmovneq %r10, %r15
-; AVX512F-NEXT: movq %rbx, %r13
-; AVX512F-NEXT: cmovneq %r11, %r13
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: cmovneq %rdx, %rsi
-; AVX512F-NEXT: movq %r14, %r9
-; AVX512F-NEXT: cmovneq %r8, %r9
-; AVX512F-NEXT: cmovneq %r12, %r10
-; AVX512F-NEXT: cmovneq %rax, %rdx
-; AVX512F-NEXT: cmovneq %r14, %r8
-; AVX512F-NEXT: cmovneq %rbx, %r11
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq %r9, %r14
-; AVX512F-NEXT: cmovneq %r11, %r14
-; AVX512F-NEXT: cmovneq %r8, %r11
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: cmovneq %r10, %rbx
-; AVX512F-NEXT: cmovneq %rdx, %r10
-; AVX512F-NEXT: cmoveq %r13, %r9
-; AVX512F-NEXT: cmoveq %r15, %rsi
-; AVX512F-NEXT: cmovneq %r13, %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmovneq %r15, %rdx
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %rdx, %rdi
-; AVX512F-NEXT: cmovneq %r8, %rdi
-; AVX512F-NEXT: cmovneq %rsi, %r8
-; AVX512F-NEXT: cmovneq %r9, %rsi
-; AVX512F-NEXT: cmovneq %rbx, %r9
-; AVX512F-NEXT: cmovneq %r14, %rbx
-; AVX512F-NEXT: cmovneq %r10, %r14
-; AVX512F-NEXT: cmoveq %r11, %rdx
-; AVX512F-NEXT: cmovneq %r11, %r10
-; AVX512F-NEXT: movq %r10, %r11
-; AVX512F-NEXT: shldq %cl, %rdx, %r11
-; AVX512F-NEXT: movq %r14, %r15
-; AVX512F-NEXT: shldq %cl, %r10, %r15
-; AVX512F-NEXT: movq %rbx, %r10
-; AVX512F-NEXT: shldq %cl, %r14, %r10
-; AVX512F-NEXT: movq %r9, %r14
-; AVX512F-NEXT: shldq %cl, %rbx, %r14
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: shldq %cl, %r9, %rbx
-; AVX512F-NEXT: movq %r8, %r9
-; AVX512F-NEXT: shldq %cl, %rsi, %r9
-; AVX512F-NEXT: movq %rdi, %rsi
-; AVX512F-NEXT: shldq %cl, %r8, %rsi
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shldq %cl, %rdi, %rdx
-; AVX512F-NEXT: movq %rdx, 56(%rax)
-; AVX512F-NEXT: movq %rsi, 48(%rax)
-; AVX512F-NEXT: movq %r9, 40(%rax)
-; AVX512F-NEXT: movq %rbx, 32(%rax)
-; AVX512F-NEXT: movq %r14, 24(%rax)
-; AVX512F-NEXT: movq %r10, 16(%rax)
-; AVX512F-NEXT: movq %r15, 8(%rax)
-; AVX512F-NEXT: movq %r11, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT: movl %edx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %esi
+; AVX512F-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512F-NEXT: vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vporq %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rdx, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT: vpsllq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rdx
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm0, %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshl_rot_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rdx, %rcx
-; AVX512VL-NEXT: movq 40(%rsi), %r8
-; AVX512VL-NEXT: movq 48(%rsi), %rdx
-; AVX512VL-NEXT: movq 16(%rsi), %rbx
-; AVX512VL-NEXT: movq 56(%rsi), %r10
-; AVX512VL-NEXT: movq 24(%rsi), %r14
-; AVX512VL-NEXT: movq (%rsi), %r9
-; AVX512VL-NEXT: movq 8(%rsi), %r15
-; AVX512VL-NEXT: movq 32(%rsi), %r12
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq %r12, %r11
-; AVX512VL-NEXT: cmovneq %r9, %r11
-; AVX512VL-NEXT: movq %r14, %r13
-; AVX512VL-NEXT: cmovneq %r10, %r13
-; AVX512VL-NEXT: movq %rbx, %rax
-; AVX512VL-NEXT: cmovneq %rdx, %rax
-; AVX512VL-NEXT: movq %r15, %rsi
-; AVX512VL-NEXT: cmovneq %r8, %rsi
-; AVX512VL-NEXT: cmovneq %r12, %r9
-; AVX512VL-NEXT: cmovneq %rbx, %rdx
-; AVX512VL-NEXT: cmovneq %r15, %r8
-; AVX512VL-NEXT: cmovneq %r14, %r10
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq %rsi, %r14
-; AVX512VL-NEXT: cmovneq %r10, %r14
-; AVX512VL-NEXT: cmovneq %r8, %r10
-; AVX512VL-NEXT: movq %rax, %rbx
-; AVX512VL-NEXT: cmovneq %r9, %rbx
-; AVX512VL-NEXT: cmovneq %rdx, %r9
-; AVX512VL-NEXT: cmoveq %r13, %rsi
-; AVX512VL-NEXT: cmoveq %r11, %rax
-; AVX512VL-NEXT: cmovneq %r13, %r8
-; AVX512VL-NEXT: cmovneq %r11, %rdx
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %rdx, %r11
-; AVX512VL-NEXT: cmovneq %r8, %r11
-; AVX512VL-NEXT: cmovneq %rax, %r8
-; AVX512VL-NEXT: cmovneq %rsi, %rax
-; AVX512VL-NEXT: cmovneq %rbx, %rsi
-; AVX512VL-NEXT: cmovneq %r14, %rbx
-; AVX512VL-NEXT: cmovneq %r9, %r14
-; AVX512VL-NEXT: cmoveq %r10, %rdx
-; AVX512VL-NEXT: cmovneq %r10, %r9
-; AVX512VL-NEXT: movq %r9, %r10
-; AVX512VL-NEXT: shldq %cl, %rdx, %r10
-; AVX512VL-NEXT: movq %r14, %r15
-; AVX512VL-NEXT: shldq %cl, %r9, %r15
-; AVX512VL-NEXT: movq %rbx, %r9
-; AVX512VL-NEXT: shldq %cl, %r14, %r9
-; AVX512VL-NEXT: movq %rsi, %r14
-; AVX512VL-NEXT: shldq %cl, %rbx, %r14
-; AVX512VL-NEXT: movq %rax, %rbx
-; AVX512VL-NEXT: shldq %cl, %rsi, %rbx
-; AVX512VL-NEXT: movq %r8, %rsi
-; AVX512VL-NEXT: shldq %cl, %rax, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: movq %r11, %rdi
-; AVX512VL-NEXT: shldq %cl, %r8, %rdi
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shldq %cl, %r11, %rdx
-; AVX512VL-NEXT: movq %rdx, 56(%rax)
-; AVX512VL-NEXT: movq %rdi, 48(%rax)
-; AVX512VL-NEXT: movq %rsi, 40(%rax)
-; AVX512VL-NEXT: movq %rbx, 32(%rax)
-; AVX512VL-NEXT: movq %r14, 24(%rax)
-; AVX512VL-NEXT: movq %r9, 16(%rax)
-; AVX512VL-NEXT: movq %r15, 8(%rax)
-; AVX512VL-NEXT: movq %r10, (%rax)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %esi
+; AVX512VL-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsrlq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: subq %rdx, %rcx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm3, %zmm3, %zmm3
+; AVX512VL-NEXT: vpsllq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm4, %zmm2, %zmm2
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rdx
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshl_rot_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq %rdx, %rcx
-; AVX512VBMI-NEXT: movq 40(%rsi), %r8
-; AVX512VBMI-NEXT: movq 48(%rsi), %rdx
-; AVX512VBMI-NEXT: movq 16(%rsi), %rbx
-; AVX512VBMI-NEXT: movq 56(%rsi), %r10
-; AVX512VBMI-NEXT: movq 24(%rsi), %r14
-; AVX512VBMI-NEXT: movq (%rsi), %r9
-; AVX512VBMI-NEXT: movq 8(%rsi), %r15
-; AVX512VBMI-NEXT: movq 32(%rsi), %r12
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq %r12, %r11
-; AVX512VBMI-NEXT: cmovneq %r9, %r11
-; AVX512VBMI-NEXT: movq %r14, %r13
-; AVX512VBMI-NEXT: cmovneq %r10, %r13
-; AVX512VBMI-NEXT: movq %rbx, %rax
-; AVX512VBMI-NEXT: cmovneq %rdx, %rax
-; AVX512VBMI-NEXT: movq %r15, %rsi
-; AVX512VBMI-NEXT: cmovneq %r8, %rsi
-; AVX512VBMI-NEXT: cmovneq %r12, %r9
-; AVX512VBMI-NEXT: cmovneq %rbx, %rdx
-; AVX512VBMI-NEXT: cmovneq %r15, %r8
-; AVX512VBMI-NEXT: cmovneq %r14, %r10
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq %rsi, %r14
-; AVX512VBMI-NEXT: cmovneq %r10, %r14
-; AVX512VBMI-NEXT: cmovneq %r8, %r10
-; AVX512VBMI-NEXT: movq %rax, %rbx
-; AVX512VBMI-NEXT: cmovneq %r9, %rbx
-; AVX512VBMI-NEXT: cmovneq %rdx, %r9
-; AVX512VBMI-NEXT: cmoveq %r13, %rsi
-; AVX512VBMI-NEXT: cmoveq %r11, %rax
-; AVX512VBMI-NEXT: cmovneq %r13, %r8
-; AVX512VBMI-NEXT: cmovneq %r11, %rdx
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %rdx, %r11
-; AVX512VBMI-NEXT: cmovneq %r8, %r11
-; AVX512VBMI-NEXT: cmovneq %rax, %r8
-; AVX512VBMI-NEXT: cmovneq %rsi, %rax
-; AVX512VBMI-NEXT: cmovneq %rbx, %rsi
-; AVX512VBMI-NEXT: cmovneq %r14, %rbx
-; AVX512VBMI-NEXT: cmovneq %r9, %r14
-; AVX512VBMI-NEXT: cmoveq %r10, %rdx
-; AVX512VBMI-NEXT: cmovneq %r10, %r9
-; AVX512VBMI-NEXT: movq %r9, %r10
-; AVX512VBMI-NEXT: shldq %cl, %rdx, %r10
-; AVX512VBMI-NEXT: movq %r14, %r15
-; AVX512VBMI-NEXT: shldq %cl, %r9, %r15
-; AVX512VBMI-NEXT: movq %rbx, %r9
-; AVX512VBMI-NEXT: shldq %cl, %r14, %r9
-; AVX512VBMI-NEXT: movq %rsi, %r14
-; AVX512VBMI-NEXT: shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT: movq %rax, %rbx
-; AVX512VBMI-NEXT: shldq %cl, %rsi, %rbx
-; AVX512VBMI-NEXT: movq %r8, %rsi
-; AVX512VBMI-NEXT: shldq %cl, %rax, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: movq %r11, %rdi
-; AVX512VBMI-NEXT: shldq %cl, %r8, %rdi
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: movq %rdx, 56(%rax)
-; AVX512VBMI-NEXT: movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT: movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT: movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT: movq %r14, 24(%rax)
-; AVX512VBMI-NEXT: movq %r9, 16(%rax)
-; AVX512VBMI-NEXT: movq %r15, 8(%rax)
-; AVX512VBMI-NEXT: movq %r10, (%rax)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rdx, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT: vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm1 = zmm3[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT: vpshldvq %zmm3, %zmm1, %zmm5
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rdx
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm2, %zmm5, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%r = call i512 @llvm.fshl.i512(i512 %a0, i512 %a0, i512 %a2)
@@ -2896,230 +2704,127 @@ define i512 @fshr_rot_i512_load(ptr %p0, i512 %a2) nounwind {
;
; AVX512F-LABEL: fshr_rot_i512_load:
; AVX512F: # %bb.0:
-; AVX512F-NEXT: pushq %r15
-; AVX512F-NEXT: pushq %r14
-; AVX512F-NEXT: pushq %r13
-; AVX512F-NEXT: pushq %r12
-; AVX512F-NEXT: pushq %rbx
-; AVX512F-NEXT: movq %rdx, %rcx
-; AVX512F-NEXT: movq 40(%rsi), %r8
-; AVX512F-NEXT: movq 48(%rsi), %r9
-; AVX512F-NEXT: movq 16(%rsi), %rax
-; AVX512F-NEXT: movq 56(%rsi), %r14
-; AVX512F-NEXT: movq 24(%rsi), %r11
-; AVX512F-NEXT: movq (%rsi), %rdx
-; AVX512F-NEXT: movq 8(%rsi), %rbx
-; AVX512F-NEXT: movq 32(%rsi), %r12
-; AVX512F-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT: movq %r12, %r15
-; AVX512F-NEXT: cmoveq %rdx, %r15
-; AVX512F-NEXT: movq %r11, %r13
-; AVX512F-NEXT: cmoveq %r14, %r13
-; AVX512F-NEXT: movq %rax, %rsi
-; AVX512F-NEXT: cmoveq %r9, %rsi
-; AVX512F-NEXT: movq %rbx, %r10
-; AVX512F-NEXT: cmoveq %r8, %r10
-; AVX512F-NEXT: cmoveq %r12, %rdx
-; AVX512F-NEXT: cmoveq %rax, %r9
-; AVX512F-NEXT: cmoveq %rbx, %r8
-; AVX512F-NEXT: cmoveq %r11, %r14
-; AVX512F-NEXT: testb $-128, %cl
-; AVX512F-NEXT: movq %r10, %r11
-; AVX512F-NEXT: cmoveq %r14, %r11
-; AVX512F-NEXT: cmoveq %r8, %r14
-; AVX512F-NEXT: movq %rsi, %rbx
-; AVX512F-NEXT: cmoveq %rdx, %rbx
-; AVX512F-NEXT: cmoveq %r9, %rdx
-; AVX512F-NEXT: cmovneq %r13, %r10
-; AVX512F-NEXT: cmovneq %r15, %rsi
-; AVX512F-NEXT: cmoveq %r13, %r8
; AVX512F-NEXT: movq %rdi, %rax
-; AVX512F-NEXT: cmoveq %r15, %r9
-; AVX512F-NEXT: testb $64, %cl
-; AVX512F-NEXT: movq %r9, %rdi
-; AVX512F-NEXT: cmoveq %r8, %rdi
-; AVX512F-NEXT: cmoveq %rsi, %r8
-; AVX512F-NEXT: cmoveq %r10, %rsi
-; AVX512F-NEXT: cmoveq %rbx, %r10
-; AVX512F-NEXT: cmoveq %r11, %rbx
-; AVX512F-NEXT: cmoveq %rdx, %r11
-; AVX512F-NEXT: cmoveq %r14, %rdx
-; AVX512F-NEXT: cmovneq %r14, %r9
-; AVX512F-NEXT: movq %r9, %r14
-; AVX512F-NEXT: shrdq %cl, %rdx, %r14
-; AVX512F-NEXT: shrdq %cl, %r11, %rdx
-; AVX512F-NEXT: shrdq %cl, %rbx, %r11
-; AVX512F-NEXT: shrdq %cl, %r10, %rbx
-; AVX512F-NEXT: shrdq %cl, %rsi, %r10
-; AVX512F-NEXT: shrdq %cl, %r8, %rsi
-; AVX512F-NEXT: shrdq %cl, %rdi, %r8
-; AVX512F-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT: shrdq %cl, %r9, %rdi
-; AVX512F-NEXT: movq %rdi, 56(%rax)
-; AVX512F-NEXT: movq %r8, 48(%rax)
-; AVX512F-NEXT: movq %rsi, 40(%rax)
-; AVX512F-NEXT: movq %r10, 32(%rax)
-; AVX512F-NEXT: movq %rbx, 24(%rax)
-; AVX512F-NEXT: movq %r11, 16(%rax)
-; AVX512F-NEXT: movq %rdx, 8(%rax)
-; AVX512F-NEXT: movq %r14, (%rax)
-; AVX512F-NEXT: popq %rbx
-; AVX512F-NEXT: popq %r12
-; AVX512F-NEXT: popq %r13
-; AVX512F-NEXT: popq %r14
-; AVX512F-NEXT: popq %r15
+; AVX512F-NEXT: vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT: vmovq %rdx, %xmm0
+; AVX512F-NEXT: vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT: vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT: vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT: movl %edx, %ecx
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: movl $-1, %esi
+; AVX512F-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpcompressq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT: vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512F-NEXT: vpsllq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT: vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT: subq %rdx, %rcx
+; AVX512F-NEXT: vmovq %rcx, %xmm3
+; AVX512F-NEXT: vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT: vpand %xmm0, %xmm3, %xmm4
+; AVX512F-NEXT: shrl $6, %ecx
+; AVX512F-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vpexpandq %zmm1, %zmm6 {%k1} {z}
+; AVX512F-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT: vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512F-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512F-NEXT: vpsrlq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT: xorl %ecx, %ecx
+; AVX512F-NEXT: negq %rdx
+; AVX512F-NEXT: sbbl %ecx, %ecx
+; AVX512F-NEXT: vporq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT: kmovw %ecx, %k1
+; AVX512F-NEXT: vporq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT: vmovdqu64 %zmm1, (%rdi)
; AVX512F-NEXT: retq
;
; AVX512VL-LABEL: fshr_rot_i512_load:
; AVX512VL: # %bb.0:
-; AVX512VL-NEXT: pushq %r15
-; AVX512VL-NEXT: pushq %r14
-; AVX512VL-NEXT: pushq %r13
-; AVX512VL-NEXT: pushq %r12
-; AVX512VL-NEXT: pushq %rbx
-; AVX512VL-NEXT: movq %rdx, %rcx
-; AVX512VL-NEXT: movq 40(%rsi), %r8
-; AVX512VL-NEXT: movq 48(%rsi), %r9
-; AVX512VL-NEXT: movq 16(%rsi), %r11
-; AVX512VL-NEXT: movq 56(%rsi), %rax
-; AVX512VL-NEXT: movq 24(%rsi), %rbx
-; AVX512VL-NEXT: movq (%rsi), %rdx
-; AVX512VL-NEXT: movq 8(%rsi), %r15
-; AVX512VL-NEXT: movq 32(%rsi), %r12
-; AVX512VL-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT: movq %r12, %r14
-; AVX512VL-NEXT: cmoveq %rdx, %r14
-; AVX512VL-NEXT: movq %rbx, %r13
-; AVX512VL-NEXT: cmoveq %rax, %r13
-; AVX512VL-NEXT: movq %r11, %rsi
-; AVX512VL-NEXT: cmoveq %r9, %rsi
-; AVX512VL-NEXT: movq %r15, %r10
-; AVX512VL-NEXT: cmoveq %r8, %r10
-; AVX512VL-NEXT: cmoveq %r12, %rdx
-; AVX512VL-NEXT: cmoveq %r11, %r9
-; AVX512VL-NEXT: cmoveq %r15, %r8
-; AVX512VL-NEXT: cmoveq %rbx, %rax
-; AVX512VL-NEXT: testb $-128, %cl
-; AVX512VL-NEXT: movq %r10, %r11
-; AVX512VL-NEXT: cmoveq %rax, %r11
-; AVX512VL-NEXT: cmoveq %r8, %rax
-; AVX512VL-NEXT: movq %rsi, %rbx
-; AVX512VL-NEXT: cmoveq %rdx, %rbx
-; AVX512VL-NEXT: cmoveq %r9, %rdx
-; AVX512VL-NEXT: cmovneq %r13, %r10
-; AVX512VL-NEXT: cmovneq %r14, %rsi
-; AVX512VL-NEXT: cmoveq %r13, %r8
-; AVX512VL-NEXT: cmoveq %r14, %r9
-; AVX512VL-NEXT: testb $64, %cl
-; AVX512VL-NEXT: movq %r9, %r14
-; AVX512VL-NEXT: cmoveq %r8, %r14
-; AVX512VL-NEXT: cmoveq %rsi, %r8
-; AVX512VL-NEXT: cmoveq %r10, %rsi
-; AVX512VL-NEXT: cmoveq %rbx, %r10
-; AVX512VL-NEXT: cmoveq %r11, %rbx
-; AVX512VL-NEXT: cmoveq %rdx, %r11
-; AVX512VL-NEXT: cmoveq %rax, %rdx
-; AVX512VL-NEXT: cmovneq %rax, %r9
-; AVX512VL-NEXT: movq %r9, %r15
-; AVX512VL-NEXT: shrdq %cl, %rdx, %r15
-; AVX512VL-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT: shrdq %cl, %rsi, %r10
-; AVX512VL-NEXT: shrdq %cl, %r8, %rsi
; AVX512VL-NEXT: movq %rdi, %rax
-; AVX512VL-NEXT: shrdq %cl, %r14, %r8
-; AVX512VL-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT: shrdq %cl, %r9, %r14
-; AVX512VL-NEXT: movq %r14, 56(%rdi)
-; AVX512VL-NEXT: movq %r8, 48(%rdi)
-; AVX512VL-NEXT: movq %rsi, 40(%rdi)
-; AVX512VL-NEXT: movq %r10, 32(%rdi)
-; AVX512VL-NEXT: movq %rbx, 24(%rdi)
-; AVX512VL-NEXT: movq %r11, 16(%rdi)
-; AVX512VL-NEXT: movq %rdx, 8(%rdi)
-; AVX512VL-NEXT: movq %r15, (%rdi)
-; AVX512VL-NEXT: popq %rbx
-; AVX512VL-NEXT: popq %r12
-; AVX512VL-NEXT: popq %r13
-; AVX512VL-NEXT: popq %r14
-; AVX512VL-NEXT: popq %r15
+; AVX512VL-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT: vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT: vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT: vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT: movl %edx, %ecx
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: movl $-1, %esi
+; AVX512VL-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT: vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT: vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT: vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT: valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT: vpaddq %zmm4, %zmm4, %zmm4
+; AVX512VL-NEXT: vpsllq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT: vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT: subq %rdx, %rcx
+; AVX512VL-NEXT: vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT: vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT: shrl $6, %ecx
+; AVX512VL-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vpexpandq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT: vpsllq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT: vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT: valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512VL-NEXT: vpsrlq $1, %zmm3, %zmm3
+; AVX512VL-NEXT: vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT: vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT: xorl %ecx, %ecx
+; AVX512VL-NEXT: negq %rdx
+; AVX512VL-NEXT: sbbl %ecx, %ecx
+; AVX512VL-NEXT: kmovd %ecx, %k1
+; AVX512VL-NEXT: vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT: vzeroupper
; AVX512VL-NEXT: retq
;
; AVX512VBMI-LABEL: fshr_rot_i512_load:
; AVX512VBMI: # %bb.0:
-; AVX512VBMI-NEXT: pushq %r15
-; AVX512VBMI-NEXT: pushq %r14
-; AVX512VBMI-NEXT: pushq %r13
-; AVX512VBMI-NEXT: pushq %r12
-; AVX512VBMI-NEXT: pushq %rbx
-; AVX512VBMI-NEXT: movq %rdx, %rcx
-; AVX512VBMI-NEXT: movq 40(%rsi), %r8
-; AVX512VBMI-NEXT: movq 48(%rsi), %r9
-; AVX512VBMI-NEXT: movq 16(%rsi), %r11
-; AVX512VBMI-NEXT: movq 56(%rsi), %rax
-; AVX512VBMI-NEXT: movq 24(%rsi), %rbx
-; AVX512VBMI-NEXT: movq (%rsi), %rdx
-; AVX512VBMI-NEXT: movq 8(%rsi), %r15
-; AVX512VBMI-NEXT: movq 32(%rsi), %r12
-; AVX512VBMI-NEXT: testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT: movq %r12, %r14
-; AVX512VBMI-NEXT: cmoveq %rdx, %r14
-; AVX512VBMI-NEXT: movq %rbx, %r13
-; AVX512VBMI-NEXT: cmoveq %rax, %r13
-; AVX512VBMI-NEXT: movq %r11, %rsi
-; AVX512VBMI-NEXT: cmoveq %r9, %rsi
-; AVX512VBMI-NEXT: movq %r15, %r10
-; AVX512VBMI-NEXT: cmoveq %r8, %r10
-; AVX512VBMI-NEXT: cmoveq %r12, %rdx
-; AVX512VBMI-NEXT: cmoveq %r11, %r9
-; AVX512VBMI-NEXT: cmoveq %r15, %r8
-; AVX512VBMI-NEXT: cmoveq %rbx, %rax
-; AVX512VBMI-NEXT: testb $-128, %cl
-; AVX512VBMI-NEXT: movq %r10, %r11
-; AVX512VBMI-NEXT: cmoveq %rax, %r11
-; AVX512VBMI-NEXT: cmoveq %r8, %rax
-; AVX512VBMI-NEXT: movq %rsi, %rbx
-; AVX512VBMI-NEXT: cmoveq %rdx, %rbx
-; AVX512VBMI-NEXT: cmoveq %r9, %rdx
-; AVX512VBMI-NEXT: cmovneq %r13, %r10
-; AVX512VBMI-NEXT: cmovneq %r14, %rsi
-; AVX512VBMI-NEXT: cmoveq %r13, %r8
-; AVX512VBMI-NEXT: cmoveq %r14, %r9
-; AVX512VBMI-NEXT: testb $64, %cl
-; AVX512VBMI-NEXT: movq %r9, %r14
-; AVX512VBMI-NEXT: cmoveq %r8, %r14
-; AVX512VBMI-NEXT: cmoveq %rsi, %r8
-; AVX512VBMI-NEXT: cmoveq %r10, %rsi
-; AVX512VBMI-NEXT: cmoveq %rbx, %r10
-; AVX512VBMI-NEXT: cmoveq %r11, %rbx
-; AVX512VBMI-NEXT: cmoveq %rdx, %r11
-; AVX512VBMI-NEXT: cmoveq %rax, %rdx
-; AVX512VBMI-NEXT: cmovneq %rax, %r9
-; AVX512VBMI-NEXT: movq %r9, %r15
-; AVX512VBMI-NEXT: shrdq %cl, %rdx, %r15
-; AVX512VBMI-NEXT: shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT: shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT: shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT: shrdq %cl, %rsi, %r10
-; AVX512VBMI-NEXT: shrdq %cl, %r8, %rsi
; AVX512VBMI-NEXT: movq %rdi, %rax
-; AVX512VBMI-NEXT: shrdq %cl, %r14, %r8
-; AVX512VBMI-NEXT: # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT: shrdq %cl, %r9, %r14
-; AVX512VBMI-NEXT: movq %r14, 56(%rdi)
-; AVX512VBMI-NEXT: movq %r8, 48(%rdi)
-; AVX512VBMI-NEXT: movq %rsi, 40(%rdi)
-; AVX512VBMI-NEXT: movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT: movq %rbx, 24(%rdi)
-; AVX512VBMI-NEXT: movq %r11, 16(%rdi)
-; AVX512VBMI-NEXT: movq %rdx, 8(%rdi)
-; AVX512VBMI-NEXT: movq %r15, (%rdi)
-; AVX512VBMI-NEXT: popq %rbx
-; AVX512VBMI-NEXT: popq %r12
-; AVX512VBMI-NEXT: popq %r13
-; AVX512VBMI-NEXT: popq %r14
-; AVX512VBMI-NEXT: popq %r15
+; AVX512VBMI-NEXT: vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT: andl $511, %edx # imm = 0x1FF
+; AVX512VBMI-NEXT: movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT: subq %rdx, %rcx
+; AVX512VBMI-NEXT: vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT: # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: movl $-1, %esi
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT: vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm4 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT: movl %edx, %ecx
+; AVX512VBMI-NEXT: shrl $6, %ecx
+; AVX512VBMI-NEXT: shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vpcompressq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT: vpshldvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT: valignq {{.*#+}} zmm1 = zmm5[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT: vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT: vpshrdvq %zmm3, %zmm1, %zmm5
+; AVX512VBMI-NEXT: xorl %ecx, %ecx
+; AVX512VBMI-NEXT: negq %rdx
+; AVX512VBMI-NEXT: sbbl %ecx, %ecx
+; AVX512VBMI-NEXT: kmovd %ecx, %k1
+; AVX512VBMI-NEXT: vporq %zmm5, %zmm2, %zmm0 {%k1}
+; AVX512VBMI-NEXT: vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT: vmovdqu %ymm0, (%rdi)
+; AVX512VBMI-NEXT: vzeroupper
; AVX512VBMI-NEXT: retq
%a0 = load i512, ptr %p0
%r = call i512 @llvm.fshr.i512(i512 %a0, i512 %a0, i512 %a2)
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 41f4528bfdc65..e4a21fcebcbe2 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -1482,312 +1482,329 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE4-NEXT: pushq %r13
; SSE4-NEXT: pushq %r12
; SSE4-NEXT: pushq %rbx
-; SSE4-NEXT: subq $120, %rsp
; SSE4-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE4-NEXT: pand %xmm0, %xmm3
+; SSE4-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT: pand %xmm0, %xmm5
+; SSE4-NEXT: pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; SSE4-NEXT: movq %xmm6, %rax
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE4-NEXT: pand %xmm0, %xmm6
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE4-NEXT: pand %xmm0, %xmm7
+; SSE4-NEXT: pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm9
+; SSE4-NEXT: pand %xmm0, %xmm9
+; SSE4-NEXT: pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; SSE4-NEXT: movq %xmm10, %r13
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm10
+; SSE4-NEXT: pand %xmm0, %xmm10
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT: pand %xmm0, %xmm11
+; SSE4-NEXT: pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3]
+; SSE4-NEXT: movq %xmm12, %rcx
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT: pand %xmm0, %xmm13
+; SSE4-NEXT: pshufd {{.*#+}} xmm12 = xmm13[2,3,2,3]
+; SSE4-NEXT: movq %xmm12, %r14
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm12
+; SSE4-NEXT: pand %xmm0, %xmm12
+; SSE4-NEXT: pshufd {{.*#+}} xmm14 = xmm12[2,3,2,3]
+; SSE4-NEXT: movq %xmm14, %rbp
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT: pand %xmm0, %xmm14
+; SSE4-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
+; SSE4-NEXT: movq %xmm15, %r10
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT: pand %xmm0, %xmm15
+; SSE4-NEXT: movq %xmm12, %r11
+; SSE4-NEXT: pshufd {{.*#+}} xmm12 = xmm15[2,3,2,3]
+; SSE4-NEXT: movq %xmm14, %rbx
; SSE4-NEXT: andl $127, %edx
; SSE4-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; SSE4-NEXT: andl $127, %r8d
; SSE4-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT: andl $127, %r10d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: andl $127, %eax
-; SSE4-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: andl $127, %ecx
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE4-NEXT: andl $127, %r8d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT: andl $127, %ebx
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT: andl $127, %edx
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT: andl $127, %r13d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT: andl $127, %r11d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT: andl $127, %r14d
; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
; SSE4-NEXT: andl $127, %r12d
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT: cmpq %rax, %rbp
-; SSE4-NEXT: movq %r12, %r15
-; SSE4-NEXT: sbbq %r14, %r15
-; SSE4-NEXT: setb %r15b
-; SSE4-NEXT: cmpq %rbp, %rax
-; SSE4-NEXT: sbbq %r12, %r14
-; SSE4-NEXT: sbbb $0, %r15b
-; SSE4-NEXT: movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT: cmpq %rax, %r14
-; SSE4-NEXT: movq %r11, %r15
-; SSE4-NEXT: sbbq %r13, %r15
+; SSE4-NEXT: cmpq %r11, %rbx
+; SSE4-NEXT: movq %r10, %r15
+; SSE4-NEXT: sbbq %rbp, %r15
+; SSE4-NEXT: movq %xmm12, %r15
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT: pand %xmm0, %xmm14
+; SSE4-NEXT: pshufd {{.*#+}} xmm12 = xmm14[2,3,2,3]
+; SSE4-NEXT: setb %dl
+; SSE4-NEXT: cmpq %rbx, %r11
+; SSE4-NEXT: movq %xmm12, %r11
+; SSE4-NEXT: pshufd {{.*#+}} xmm12 = xmm10[2,3,2,3]
+; SSE4-NEXT: movq %xmm15, %rbx
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT: sbbq %r10, %rbp
+; SSE4-NEXT: pand %xmm0, %xmm15
+; SSE4-NEXT: movq %xmm14, %r8
+; SSE4-NEXT: sbbb $0, %dl
+; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: cmpq %rbx, %r8
+; SSE4-NEXT: movq %r11, %r10
+; SSE4-NEXT: sbbq %r15, %r10
+; SSE4-NEXT: pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3]
+; SSE4-NEXT: setb %dl
+; SSE4-NEXT: cmpq %r8, %rbx
+; SSE4-NEXT: movq %xmm14, %r8
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT: pand %xmm0, %xmm14
+; SSE4-NEXT: sbbq %r11, %r15
+; SSE4-NEXT: movq %xmm13, %r10
+; SSE4-NEXT: movq %xmm15, %r11
+; SSE4-NEXT: sbbb $0, %dl
+; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: cmpq %r10, %r11
+; SSE4-NEXT: movq %r8, %rbx
+; SSE4-NEXT: sbbq %r14, %rbx
+; SSE4-NEXT: pshufd {{.*#+}} xmm13 = xmm14[2,3,2,3]
+; SSE4-NEXT: movq %xmm13, %rbx
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT: pand %xmm0, %xmm15
+; SSE4-NEXT: pshufd {{.*#+}} xmm13 = xmm15[2,3,2,3]
; SSE4-NEXT: setb %bpl
-; SSE4-NEXT: cmpq %r14, %rax
-; SSE4-NEXT: sbbq %r11, %r13
+; SSE4-NEXT: cmpq %r11, %r10
+; SSE4-NEXT: movq %xmm13, %r10
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT: pand %xmm0, %xmm13
+; SSE4-NEXT: movq %xmm14, %r11
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT: pand %xmm0, %xmm14
+; SSE4-NEXT: sbbq %r8, %r14
+; SSE4-NEXT: movq %xmm15, %rdx
+; SSE4-NEXT: pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
; SSE4-NEXT: sbbb $0, %bpl
; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT: cmpq %rax, %r11
-; SSE4-NEXT: movq %rdx, %r14
-; SSE4-NEXT: sbbq %rbx, %r14
+; SSE4-NEXT: cmpq %r11, %rdx
+; SSE4-NEXT: movq %r10, %r8
+; SSE4-NEXT: sbbq %rbx, %r8
; SSE4-NEXT: setb %bpl
-; SSE4-NEXT: cmpq %r11, %rax
-; SSE4-NEXT: sbbq %rdx, %rbx
+; SSE4-NEXT: cmpq %rdx, %r11
+; SSE4-NEXT: movq %xmm15, %rdx
+; SSE4-NEXT: sbbq %r10, %rbx
+; SSE4-NEXT: movq %xmm11, %r8
+; SSE4-NEXT: movq %xmm14, %r10
; SSE4-NEXT: sbbb $0, %bpl
; SSE4-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT: cmpq %rax, %rdx
-; SSE4-NEXT: movq %r8, %r11
+; SSE4-NEXT: cmpq %r8, %r10
+; SSE4-NEXT: movq %rdx, %r11
; SSE4-NEXT: sbbq %rcx, %r11
-; SSE4-NEXT: setb %r11b
-; SSE4-NEXT: cmpq %rdx, %rax
-; SSE4-NEXT: sbbq %r8, %rcx
-; SSE4-NEXT: sbbb $0, %r11b
-; SSE4-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT: pshufd {{.*#+}} xmm11 = xmm13[2,3,2,3]
+; SSE4-NEXT: movq %xmm11, %r11
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT: pand %xmm0, %xmm11
+; SSE4-NEXT: pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3]
+; SSE4-NEXT: setb %bl
+; SSE4-NEXT: cmpq %r10, %r8
+; SSE4-NEXT: movq %xmm14, %r8
+; SSE4-NEXT: movq %xmm13, %r10
+; SSE4-NEXT: sbbq %rdx, %rcx
+; SSE4-NEXT: movq %xmm11, %rcx
+; SSE4-NEXT: sbbb $0, %bl
+; SSE4-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: cmpq %r10, %rcx
; SSE4-NEXT: movq %r8, %rdx
-; SSE4-NEXT: sbbq %r10, %rdx
-; SSE4-NEXT: setb %dl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r8, %r10
-; SSE4-NEXT: sbbb $0, %dl
-; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rdx
+; SSE4-NEXT: sbbq %r11, %rdx
+; SSE4-NEXT: setb %bl
+; SSE4-NEXT: cmpq %rcx, %r10
+; SSE4-NEXT: movq %xmm12, %rcx
+; SSE4-NEXT: movq %xmm9, %rdx
+; SSE4-NEXT: sbbq %r8, %r11
+; SSE4-NEXT: movq %xmm10, %r8
+; SSE4-NEXT: sbbb $0, %bl
+; SSE4-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: cmpq %rdx, %r8
+; SSE4-NEXT: movq %rcx, %r10
+; SSE4-NEXT: sbbq %r13, %r10
+; SSE4-NEXT: movq %xmm8, %r11
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE4-NEXT: pand %xmm0, %xmm8
+; SSE4-NEXT: pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
; SSE4-NEXT: setb %r10b
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
+; SSE4-NEXT: cmpq %r8, %rdx
+; SSE4-NEXT: movq %xmm9, %rdx
+; SSE4-NEXT: sbbq %rcx, %r13
+; SSE4-NEXT: movq %xmm7, %rcx
+; SSE4-NEXT: movq %xmm8, %r8
; SSE4-NEXT: sbbb $0, %r10b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rdx
-; SSE4-NEXT: setb %dl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
-; SSE4-NEXT: sbbb $0, %dl
-; SSE4-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rdx
-; SSE4-NEXT: setb %bpl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
-; SSE4-NEXT: sbbb $0, %bpl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT: movq %r11, %rdx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rdx
+; SSE4-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT: cmpq %rcx, %r8
+; SSE4-NEXT: movq %rdx, %rbx
+; SSE4-NEXT: sbbq %r11, %rbx
+; SSE4-NEXT: pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE4-NEXT: setb %r13b
+; SSE4-NEXT: cmpq %r8, %rcx
+; SSE4-NEXT: movq %xmm7, %r8
+; SSE4-NEXT: movq %xmm5, %rbx
+; SSE4-NEXT: sbbq %rdx, %r11
+; SSE4-NEXT: movq %xmm6, %rdx
+; SSE4-NEXT: sbbb $0, %r13b
+; SSE4-NEXT: cmpq %rbx, %rdx
+; SSE4-NEXT: movq %r8, %rcx
+; SSE4-NEXT: sbbq %rax, %rcx
+; SSE4-NEXT: movq %xmm4, %r11
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE4-NEXT: pand %xmm0, %xmm4
+; SSE4-NEXT: pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE4-NEXT: setb %r10b
+; SSE4-NEXT: cmpq %rdx, %rbx
+; SSE4-NEXT: movq %xmm5, %rbx
+; SSE4-NEXT: movq %xmm3, %r15
+; SSE4-NEXT: sbbq %r8, %rax
+; SSE4-NEXT: movq %xmm4, %rax
+; SSE4-NEXT: sbbb $0, %r10b
+; SSE4-NEXT: cmpq %r15, %rax
+; SSE4-NEXT: movq %rbx, %rdx
+; SSE4-NEXT: sbbq %r11, %rdx
; SSE4-NEXT: setb %dl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r11, %r8
+; SSE4-NEXT: cmpq %rax, %r15
+; SSE4-NEXT: movq %xmm2, %rax
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT: pand %xmm0, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT: movq %xmm3, %r15
+; SSE4-NEXT: sbbq %rbx, %r11
+; SSE4-NEXT: movq %xmm1, %r11
+; SSE4-NEXT: movq %xmm2, %rbx
; SSE4-NEXT: sbbb $0, %dl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT: movq %r14, %r8
-; SSE4-NEXT: movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT: sbbq %rbx, %r8
+; SSE4-NEXT: cmpq %r11, %rbx
+; SSE4-NEXT: movq %r15, %r8
+; SSE4-NEXT: sbbq %rax, %r8
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: setb %r8b
+; SSE4-NEXT: cmpq %rbx, %r11
+; SSE4-NEXT: movq %xmm2, %rbx
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT: pand %xmm0, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT: movq %xmm3, %rbp
+; SSE4-NEXT: sbbq %r15, %rax
+; SSE4-NEXT: movq %xmm1, %rax
+; SSE4-NEXT: movq %xmm2, %r15
+; SSE4-NEXT: sbbb $0, %r8b
+; SSE4-NEXT: cmpq %rax, %r15
+; SSE4-NEXT: movq %rbp, %r11
+; SSE4-NEXT: sbbq %rbx, %r11
; SSE4-NEXT: setb %r11b
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r14, %rbx
+; SSE4-NEXT: cmpq %r15, %rax
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: movq %xmm2, %rax
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT: pand %xmm0, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT: sbbq %rbp, %rbx
+; SSE4-NEXT: movq %xmm1, %r15
+; SSE4-NEXT: movq %xmm2, %rbp
; SSE4-NEXT: sbbb $0, %r11b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT: cmpq %rax, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT: cmpq %r15, %rbp
+; SSE4-NEXT: movq %xmm3, %r14
; SSE4-NEXT: movq %r14, %rbx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rbx
+; SSE4-NEXT: sbbq %rax, %rbx
; SSE4-NEXT: setb %bl
-; SSE4-NEXT: cmpq %rcx, %rax
-; SSE4-NEXT: sbbq %r14, %r8
+; SSE4-NEXT: cmpq %rbp, %r15
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: sbbq %r14, %rax
+; SSE4-NEXT: movq %xmm2, %rax
+; SSE4-NEXT: movq %xmm1, %r14
; SSE4-NEXT: sbbb $0, %bl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT: cmpq %rax, %r14
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE4-NEXT: movq %r15, %rcx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %rcx
-; SSE4-NEXT: setb %cl
-; SSE4-NEXT: cmpq %r14, %rax
-; SSE4-NEXT: sbbq %r15, %r8
-; SSE4-NEXT: sbbb $0, %cl
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT: cmpq %rax, %r15
+; SSE4-NEXT: cmpq %r9, %r14
+; SSE4-NEXT: movq %rax, %r15
+; SSE4-NEXT: sbbq %r12, %r15
+; SSE4-NEXT: setb %bpl
+; SSE4-NEXT: cmpq %r14, %r9
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: sbbq %rax, %r12
+; SSE4-NEXT: movq %xmm2, %rax
+; SSE4-NEXT: movq %xmm1, %r14
+; SSE4-NEXT: sbbb $0, %bpl
; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: movq %r12, %r14
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r14
+; SSE4-NEXT: cmpq %r12, %r14
+; SSE4-NEXT: movq %rax, %r9
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: sbbq %rcx, %r9
+; SSE4-NEXT: movq %rdi, %r15
+; SSE4-NEXT: setb %r9b
+; SSE4-NEXT: cmpq %r14, %r12
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT: pand %xmm0, %xmm1
+; SSE4-NEXT: pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE4-NEXT: pand %xmm0, %xmm3
+; SSE4-NEXT: pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE4-NEXT: movq %xmm4, %r12
+; SSE4-NEXT: sbbq %rax, %rcx
+; SSE4-NEXT: movq %xmm3, %rax
+; SSE4-NEXT: sbbb $0, %r9b
+; SSE4-NEXT: cmpq %rsi, %rax
+; SSE4-NEXT: movq %r12, %rdi
+; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT: sbbq %rcx, %rdi
+; SSE4-NEXT: movq %xmm2, %rdi
+; SSE4-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT: pand %xmm0, %xmm2
+; SSE4-NEXT: pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
; SSE4-NEXT: setb %r14b
-; SSE4-NEXT: cmpq %r15, %rax
-; SSE4-NEXT: sbbq %r12, %r8
+; SSE4-NEXT: cmpq %rax, %rsi
+; SSE4-NEXT: movq %xmm0, %rsi
+; SSE4-NEXT: sbbq %r12, %rcx
+; SSE4-NEXT: movq %xmm1, %r12
+; SSE4-NEXT: movq %xmm2, %rax
; SSE4-NEXT: sbbb $0, %r14b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: cmpq %r9, %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT: movq %r12, %r15
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r15
-; SSE4-NEXT: setb %r15b
-; SSE4-NEXT: cmpq %rax, %r9
-; SSE4-NEXT: sbbq %r12, %r8
-; SSE4-NEXT: sbbb $0, %r15b
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
; SSE4-NEXT: cmpq %r12, %rax
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT: movq %r13, %r9
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: sbbq %r8, %r9
-; SSE4-NEXT: setb %r9b
+; SSE4-NEXT: movq %rsi, %rcx
+; SSE4-NEXT: sbbq %rdi, %rcx
+; SSE4-NEXT: setb %cl
; SSE4-NEXT: cmpq %rax, %r12
-; SSE4-NEXT: sbbq %r13, %r8
-; SSE4-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT: sbbb $0, %r9b
-; SSE4-NEXT: cmpq %rsi, %r12
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: movq %r8, %rdi
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: sbbq %rax, %rdi
-; SSE4-NEXT: setb %dil
-; SSE4-NEXT: cmpq %r12, %rsi
-; SSE4-NEXT: sbbq %r8, %rax
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT: sbbb $0, %dil
-; SSE4-NEXT: cmpq %r12, %r13
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT: movq %r8, %rsi
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: sbbq %rax, %rsi
-; SSE4-NEXT: setb %sil
-; SSE4-NEXT: cmpq %r13, %r12
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm1
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm2
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm3
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm4
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r12d, %xmm5
-; SSE4-NEXT: movzbl %r10b, %r10d
-; SSE4-NEXT: movd %r10d, %xmm6
-; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT: movd %r10d, %xmm7
-; SSE4-NEXT: movzbl %bpl, %r10d
-; SSE4-NEXT: movd %r10d, %xmm0
-; SSE4-NEXT: movzbl %dl, %edx
-; SSE4-NEXT: movd %edx, %xmm8
-; SSE4-NEXT: movzbl %r11b, %edx
-; SSE4-NEXT: movd %edx, %xmm9
-; SSE4-NEXT: movzbl %bl, %edx
-; SSE4-NEXT: movd %edx, %xmm10
-; SSE4-NEXT: movzbl %cl, %ecx
-; SSE4-NEXT: movd %ecx, %xmm11
-; SSE4-NEXT: movzbl %r14b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm12
-; SSE4-NEXT: movzbl %r15b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm13
-; SSE4-NEXT: movzbl %r9b, %ecx
-; SSE4-NEXT: movd %ecx, %xmm14
-; SSE4-NEXT: movzbl %dil, %ecx
-; SSE4-NEXT: movd %ecx, %xmm15
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm1
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm2
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm3
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm4
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm5
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm6
+; SSE4-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT: movd %eax, %xmm7
+; SSE4-NEXT: movzbl %r13b, %eax
+; SSE4-NEXT: movd %eax, %xmm0
+; SSE4-NEXT: movzbl %r10b, %eax
+; SSE4-NEXT: movd %eax, %xmm8
+; SSE4-NEXT: movzbl %dl, %eax
+; SSE4-NEXT: movd %eax, %xmm9
+; SSE4-NEXT: movzbl %r8b, %eax
+; SSE4-NEXT: movd %eax, %xmm10
+; SSE4-NEXT: movzbl %r11b, %eax
+; SSE4-NEXT: movd %eax, %xmm11
+; SSE4-NEXT: movzbl %bl, %eax
+; SSE4-NEXT: movd %eax, %xmm12
+; SSE4-NEXT: movzbl %bpl, %eax
+; SSE4-NEXT: movd %eax, %xmm13
+; SSE4-NEXT: movzbl %r9b, %eax
+; SSE4-NEXT: movd %eax, %xmm14
+; SSE4-NEXT: movzbl %r14b, %eax
+; SSE4-NEXT: movd %eax, %xmm15
; SSE4-NEXT: punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
; SSE4-NEXT: punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
; SSE4-NEXT: punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1802,77 +1819,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE4-NEXT: punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
; SSE4-NEXT: punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
; SSE4-NEXT: punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT: sbbq %r8, %rax
-; SSE4-NEXT: sbbb $0, %sil
+; SSE4-NEXT: sbbq %rsi, %rdi
+; SSE4-NEXT: sbbb $0, %cl
; SSE4-NEXT: punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE4-NEXT: movzbl %sil, %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT: movb %cl, 4(%rax)
+; SSE4-NEXT: movzbl %cl, %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: movb %al, 4(%r15)
; SSE4-NEXT: movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: leaq (%rcx,%rax,4), %rax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $4, %ecx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $6, %eax
+; SSE4-NEXT: orq %rcx, %rax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $8, %ecx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $10, %eax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: leaq (%rdx,%rcx,4), %rcx
+; SSE4-NEXT: shll $12, %edx
+; SSE4-NEXT: orq %rax, %rdx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT: andl $3, %esi
+; SSE4-NEXT: shll $14, %esi
+; SSE4-NEXT: orq %rdx, %rsi
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shll $16, %eax
+; SSE4-NEXT: orq %rsi, %rax
+; SSE4-NEXT: orq %rcx, %rax
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT: andl $3, %ecx
+; SSE4-NEXT: shll $18, %ecx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $4, %edx
+; SSE4-NEXT: shll $20, %edx
; SSE4-NEXT: orq %rcx, %rdx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $6, %ecx
+; SSE4-NEXT: shll $22, %ecx
; SSE4-NEXT: orq %rdx, %rcx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $8, %edx
+; SSE4-NEXT: shll $24, %edx
; SSE4-NEXT: orq %rcx, %rdx
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $10, %ecx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $12, %esi
-; SSE4-NEXT: orq %rcx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT: andl $3, %edi
-; SSE4-NEXT: shll $14, %edi
-; SSE4-NEXT: orq %rsi, %rdi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shll $16, %ecx
-; SSE4-NEXT: orq %rdi, %rcx
+; SSE4-NEXT: shlq $26, %rcx
; SSE4-NEXT: orq %rdx, %rcx
+; SSE4-NEXT: orq %rax, %rcx
+; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT: andl $3, %eax
+; SSE4-NEXT: shlq $28, %rax
; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $18, %edx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $20, %esi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shll $22, %edx
-; SSE4-NEXT: orq %rsi, %rdx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shll $24, %esi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT: andl $3, %edx
-; SSE4-NEXT: shlq $26, %rdx
-; SSE4-NEXT: orq %rsi, %rdx
+; SSE4-NEXT: shlq $30, %rdx
+; SSE4-NEXT: orq %rax, %rdx
; SSE4-NEXT: orq %rcx, %rdx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT: andl $3, %ecx
-; SSE4-NEXT: shlq $28, %rcx
-; SSE4-NEXT: movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT: andl $3, %esi
-; SSE4-NEXT: shlq $30, %rsi
-; SSE4-NEXT: orq %rcx, %rsi
-; SSE4-NEXT: orq %rdx, %rsi
-; SSE4-NEXT: movl %esi, (%rax)
-; SSE4-NEXT: addq $120, %rsp
+; SSE4-NEXT: movl %edx, (%r15)
+; SSE4-NEXT: movq %r15, %rax
; SSE4-NEXT: popq %rbx
; SSE4-NEXT: popq %r12
; SSE4-NEXT: popq %r13
@@ -1889,356 +1905,336 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; SSE2-NEXT: pushq %r13
; SSE2-NEXT: pushq %r12
; SSE2-NEXT: pushq %rbx
-; SSE2-NEXT: subq $96, %rsp
-; SSE2-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT: andl $127, %ebx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT: andl $127, %r14d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: andl $127, %r15d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT: andl $127, %r12d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT: andl $127, %r13d
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT: andl $127, %ebp
+; SSE2-NEXT: movdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pextrq $1, %xmm3, %rbx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pextrq $1, %xmm4, %r14
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pextrq $1, %xmm5, %r12
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pextrq $1, %xmm6, %r11
+; SSE2-NEXT: movq %xmm6, %rax
+; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT: andl $127, %r10d
; SSE2-NEXT: andl $127, %edx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT: andl $127, %r11d
-; SSE2-NEXT: movq %r8, %rcx
-; SSE2-NEXT: andl $127, %ecx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: andl $127, %eax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: cmpq %rsi, %r8
-; SSE2-NEXT: movq %rax, %r10
-; SSE2-NEXT: sbbq %rcx, %r10
-; SSE2-NEXT: setb %r10b
-; SSE2-NEXT: cmpq %r8, %rsi
-; SSE2-NEXT: sbbq %rax, %rcx
+; SSE2-NEXT: andl $127, %r8d
+; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: movq %r11, %r15
+; SSE2-NEXT: sbbq %r8, %r15
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: setb %bpl
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: pextrq $1, %xmm6, %rax
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm8
+; SSE2-NEXT: pextrq $1, %xmm8, %r15
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: sbbq %r11, %r8
+; SSE2-NEXT: movq %xmm8, %rcx
+; SSE2-NEXT: sbbb $0, %bpl
+; SSE2-NEXT: movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %rsi, %rcx
+; SSE2-NEXT: movq %r15, %r8
+; SSE2-NEXT: sbbq %rdx, %r8
+; SSE2-NEXT: setb %r8b
+; SSE2-NEXT: cmpq %rcx, %rsi
+; SSE2-NEXT: pextrq $1, %xmm7, %rcx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT: pand %xmm0, %xmm8
+; SSE2-NEXT: pextrq $1, %xmm8, %rsi
+; SSE2-NEXT: sbbq %r15, %rdx
; SSE2-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT: sbbb $0, %r10b
-; SSE2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: cmpq %r10, %rax
-; SSE2-NEXT: movq %r11, %rcx
-; SSE2-NEXT: sbbq %rdx, %rcx
-; SSE2-NEXT: setb %cl
-; SSE2-NEXT: cmpq %rax, %r10
-; SSE2-NEXT: sbbq %r11, %rdx
-; SSE2-NEXT: sbbb $0, %cl
-; SSE2-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: cmpq %r9, %rax
-; SSE2-NEXT: movq %rbp, %rcx
-; SSE2-NEXT: sbbq %r13, %rcx
+; SSE2-NEXT: movq %xmm8, %rdx
+; SSE2-NEXT: sbbb $0, %r8b
+; SSE2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %r9, %rdx
+; SSE2-NEXT: movq %rsi, %rdi
+; SSE2-NEXT: sbbq %r10, %rdi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm8
; SSE2-NEXT: setb %dil
-; SSE2-NEXT: cmpq %rax, %r9
-; SSE2-NEXT: sbbq %rbp, %r13
+; SSE2-NEXT: cmpq %rdx, %r9
+; SSE2-NEXT: movq %xmm7, %rdx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: pextrq $1, %xmm7, %r9
+; SSE2-NEXT: pand %xmm0, %xmm8
+; SSE2-NEXT: sbbq %rsi, %r10
+; SSE2-NEXT: movq %xmm7, %rsi
; SSE2-NEXT: sbbb $0, %dil
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq %r12, %r9
-; SSE2-NEXT: sbbq %r15, %r9
-; SSE2-NEXT: setb %r11b
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r12, %r15
-; SSE2-NEXT: sbbb $0, %r11b
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq %r14, %r9
-; SSE2-NEXT: sbbq %rbx, %r9
-; SSE2-NEXT: setb %r9b
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r14, %rbx
-; SSE2-NEXT: sbbb $0, %r9b
-; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %rdx, %rsi
+; SSE2-NEXT: movq %r9, %r10
+; SSE2-NEXT: sbbq %rcx, %r10
+; SSE2-NEXT: pextrq $1, %xmm8, %r10
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: setb %r15b
+; SSE2-NEXT: cmpq %rsi, %rdx
+; SSE2-NEXT: pextrq $1, %xmm7, %rdx
+; SSE2-NEXT: sbbq %r9, %rcx
+; SSE2-NEXT: movq %xmm8, %rcx
+; SSE2-NEXT: movq %xmm7, %rsi
+; SSE2-NEXT: sbbb $0, %r15b
+; SSE2-NEXT: cmpq %rcx, %rsi
+; SSE2-NEXT: movq %rdx, %r9
+; SSE2-NEXT: sbbq %r10, %r9
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT: setb %dil
+; SSE2-NEXT: cmpq %rsi, %rcx
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: pextrq $1, %xmm6, %rsi
+; SSE2-NEXT: pand %xmm0, %xmm7
+; SSE2-NEXT: sbbq %rdx, %r10
+; SSE2-NEXT: movq %xmm6, %rdx
+; SSE2-NEXT: sbbb $0, %dil
+; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %rcx, %rdx
; SSE2-NEXT: movq %rsi, %r9
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT: sbbq %rdx, %r9
-; SSE2-NEXT: setb %r9b
-; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %rsi, %rdx
-; SSE2-NEXT: sbbb $0, %r9b
-; SSE2-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT: sbbq %rax, %r9
+; SSE2-NEXT: pextrq $1, %xmm7, %r9
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: setb %dil
+; SSE2-NEXT: cmpq %rdx, %rcx
+; SSE2-NEXT: pextrq $1, %xmm6, %rcx
+; SSE2-NEXT: sbbq %rsi, %rax
+; SSE2-NEXT: movq %xmm7, %rax
+; SSE2-NEXT: movq %xmm6, %rdx
+; SSE2-NEXT: sbbb $0, %dil
+; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %rax, %rdx
+; SSE2-NEXT: movq %rcx, %rsi
+; SSE2-NEXT: sbbq %r9, %rsi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT: setb %sil
+; SSE2-NEXT: cmpq %rdx, %rax
+; SSE2-NEXT: movq %xmm5, %rax
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: pextrq $1, %xmm5, %rdx
+; SSE2-NEXT: pand %xmm0, %xmm6
+; SSE2-NEXT: sbbq %rcx, %r9
+; SSE2-NEXT: movq %xmm5, %rcx
+; SSE2-NEXT: sbbb $0, %sil
+; SSE2-NEXT: movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: movq %r8, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: sbbq %rsi, %rdx
-; SSE2-NEXT: setb %dl
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: sbbq %r12, %rsi
+; SSE2-NEXT: pextrq $1, %xmm6, %rsi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: setb %dil
; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r8, %rsi
-; SSE2-NEXT: sbbb $0, %dl
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: cmpq %rax, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: movq %r8, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: sbbq %rsi, %rdx
-; SSE2-NEXT: setb %dl
+; SSE2-NEXT: pextrq $1, %xmm5, %rax
+; SSE2-NEXT: sbbq %rdx, %r12
+; SSE2-NEXT: movq %xmm6, %rcx
+; SSE2-NEXT: movq %xmm5, %rdx
+; SSE2-NEXT: sbbb $0, %dil
+; SSE2-NEXT: movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT: cmpq %rcx, %rdx
+; SSE2-NEXT: movq %rax, %r12
+; SSE2-NEXT: sbbq %rsi, %r12
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT: setb %r12b
+; SSE2-NEXT: cmpq %rdx, %rcx
+; SSE2-NEXT: movq %xmm4, %rcx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: pextrq $1, %xmm4, %rdx
+; SSE2-NEXT: pand %xmm0, %xmm5
+; SSE2-NEXT: sbbq %rax, %rsi
+; SSE2-NEXT: movq %xmm4, %rax
+; SSE2-NEXT: sbbb $0, %r12b
; SSE2-NEXT: cmpq %rcx, %rax
-; SSE2-NEXT: sbbq %r8, %rsi
-; SSE2-NEXT: sbbb $0, %dl
-; SSE2-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: sbbq %r14, %rsi
+; SSE2-NEXT: pextrq $1, %xmm5, %rsi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: pand %xmm0, %xmm4
+; SSE2-NEXT: setb %r13b
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: pextrq $1, %xmm4, %rax
+; SSE2-NEXT: sbbq %rdx, %r14
+; SSE2-NEXT: movq %xmm5, %rcx
+; SSE2-NEXT: movq %xmm4, %rdx
+; SSE2-NEXT: sbbb $0, %r13b
; SSE2-NEXT: cmpq %rcx, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: movq %r8, %rax
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT: sbbq %rsi, %rax
-; SSE2-NEXT: setb %r9b
+; SSE2-NEXT: movq %rax, %r14
+; SSE2-NEXT: sbbq %rsi, %r14
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT: setb %r14b
; SSE2-NEXT: cmpq %rdx, %rcx
-; SSE2-NEXT: sbbq %r8, %rsi
-; SSE2-NEXT: sbbb $0, %r9b
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: cmpq %rdx, %rsi
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT: movq %r8, %rcx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: sbbq %rax, %rcx
-; SSE2-NEXT: setb %cl
-; SSE2-NEXT: cmpq %rsi, %rdx
-; SSE2-NEXT: sbbq %r8, %rax
-; SSE2-NEXT: sbbb $0, %cl
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: cmpq %rsi, %r8
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT: movq %r10, %rdx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: sbbq %rax, %rdx
-; SSE2-NEXT: setb %dl
-; SSE2-NEXT: cmpq %r8, %rsi
-; SSE2-NEXT: sbbq %r10, %rax
-; SSE2-NEXT: sbbb $0, %dl
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: cmpq %r8, %r10
-; SSE2-NEXT: movq (%rsp), %rbx # 8-byte Reload
-; SSE2-NEXT: movq %rbx, %rsi
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movq %xmm3, %rdi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: pextrq $1, %xmm3, %rdx
+; SSE2-NEXT: pand %xmm0, %xmm4
; SSE2-NEXT: sbbq %rax, %rsi
-; SSE2-NEXT: setb %sil
-; SSE2-NEXT: cmpq %r10, %r8
-; SSE2-NEXT: sbbq %rbx, %rax
-; SSE2-NEXT: sbbb $0, %sil
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT: cmpq %r10, %rbx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT: movq %r14, %r8
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT: movq %xmm3, %rax
+; SSE2-NEXT: sbbb $0, %r14b
+; SSE2-NEXT: cmpq %rdi, %rax
+; SSE2-NEXT: movq %rdx, %rsi
+; SSE2-NEXT: sbbq %rbx, %rsi
+; SSE2-NEXT: pextrq $1, %xmm4, %rcx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: setb %bpl
+; SSE2-NEXT: cmpq %rax, %rdi
+; SSE2-NEXT: pextrq $1, %xmm3, %rsi
+; SSE2-NEXT: sbbq %rdx, %rbx
+; SSE2-NEXT: movq %xmm4, %rdx
+; SSE2-NEXT: movq %xmm3, %rdi
+; SSE2-NEXT: sbbb $0, %bpl
+; SSE2-NEXT: cmpq %rdx, %rdi
+; SSE2-NEXT: movq %rsi, %rbx
+; SSE2-NEXT: sbbq %rcx, %rbx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT: setb %bl
+; SSE2-NEXT: cmpq %rdi, %rdx
+; SSE2-NEXT: movq %xmm2, %rdi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: pextrq $1, %xmm2, %rax
+; SSE2-NEXT: pand %xmm0, %xmm3
+; SSE2-NEXT: sbbq %rsi, %rcx
+; SSE2-NEXT: movq %xmm2, %rcx
+; SSE2-NEXT: sbbb $0, %bl
+; SSE2-NEXT: cmpq %rdi, %rcx
+; SSE2-NEXT: movq %rax, %rdx
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT: sbbq %r8, %rdx
+; SSE2-NEXT: pextrq $1, %xmm3, %rsi
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: setb %r11b
+; SSE2-NEXT: cmpq %rcx, %rdi
+; SSE2-NEXT: pextrq $1, %xmm2, %rdi
; SSE2-NEXT: sbbq %rax, %r8
-; SSE2-NEXT: setb %r8b
-; SSE2-NEXT: cmpq %rbx, %r10
-; SSE2-NEXT: sbbq %r14, %rax
-; SSE2-NEXT: sbbb $0, %r8b
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT: cmpq %rbx, %r14
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT: movq %r15, %r10
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: sbbq %rax, %r10
+; SSE2-NEXT: movq %xmm3, %rcx
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: sbbb $0, %r11b
+; SSE2-NEXT: cmpq %rcx, %rax
+; SSE2-NEXT: movq %rdi, %rdx
+; SSE2-NEXT: sbbq %rsi, %rdx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm2
; SSE2-NEXT: setb %r10b
-; SSE2-NEXT: cmpq %r14, %rbx
-; SSE2-NEXT: sbbq %r15, %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
+; SSE2-NEXT: cmpq %rax, %rcx
+; SSE2-NEXT: movq %xmm1, %r8
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: pextrq $1, %xmm1, %rdx
+; SSE2-NEXT: pand %xmm0, %xmm2
+; SSE2-NEXT: sbbq %rdi, %rsi
+; SSE2-NEXT: movq %xmm1, %rdi
; SSE2-NEXT: sbbb $0, %r10b
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: cmpq %r14, %r15
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT: movq %r12, %rbx
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: sbbq %rax, %rbx
-; SSE2-NEXT: setb %bl
-; SSE2-NEXT: cmpq %r15, %r14
-; SSE2-NEXT: sbbq %r12, %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: sbbb $0, %bl
-; SSE2-NEXT: cmpq %r14, %r15
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT: movq %r13, %r12
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: sbbq %rax, %r12
-; SSE2-NEXT: setb %bpl
-; SSE2-NEXT: cmpq %r15, %r14
-; SSE2-NEXT: sbbq %r13, %rax
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT: sbbb $0, %bpl
-; SSE2-NEXT: cmpq %r14, %r15
+; SSE2-NEXT: cmpq %r8, %rdi
+; SSE2-NEXT: movq %rdx, %rsi
; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT: movq %rax, %r12
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT: sbbq %r13, %r12
-; SSE2-NEXT: setb %r12b
-; SSE2-NEXT: cmpq %r15, %r14
-; SSE2-NEXT: sbbq %rax, %r13
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SSE2-NEXT: movzbl %dil, %r14d
-; SSE2-NEXT: movd %r13d, %xmm0
-; SSE2-NEXT: movzbl %r11b, %edi
-; SSE2-NEXT: sbbb $0, %r12b
-; SSE2-NEXT: movzbl %r12b, %r11d
-; SSE2-NEXT: pinsrb $1, %r15d, %xmm0
-; SSE2-NEXT: pinsrb $2, %r14d, %xmm0
+; SSE2-NEXT: sbbq %rax, %rsi
+; SSE2-NEXT: pextrq $1, %xmm2, %rcx
+; SSE2-NEXT: movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT: pand %xmm0, %xmm1
+; SSE2-NEXT: setb %r9b
+; SSE2-NEXT: cmpq %rdi, %r8
+; SSE2-NEXT: pextrq $1, %xmm1, %rsi
+; SSE2-NEXT: sbbq %rdx, %rax
+; SSE2-NEXT: movq %xmm2, %rax
+; SSE2-NEXT: movq %xmm1, %rdi
+; SSE2-NEXT: sbbb $0, %r9b
+; SSE2-NEXT: cmpq %rax, %rdi
+; SSE2-NEXT: movq %rsi, %rdx
+; SSE2-NEXT: sbbq %rcx, %rdx
+; SSE2-NEXT: setb %dl
+; SSE2-NEXT: cmpq %rdi, %rax
+; SSE2-NEXT: sbbq %rsi, %rcx
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT: movd %ecx, %xmm0
+; SSE2-NEXT: movzbl %r15b, %edi
+; SSE2-NEXT: sbbb $0, %dl
+; SSE2-NEXT: movzbl %dl, %ecx
+; SSE2-NEXT: pinsrb $1, %eax, %xmm0
+; SSE2-NEXT: pinsrb $2, %r8d, %xmm0
; SSE2-NEXT: pinsrb $3, %edi, %xmm0
-; SSE2-NEXT: andl $3, %r11d
-; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT: movb %r11b, 4(%r12)
-; SSE2-NEXT: movd %xmm0, %r11d
-; SSE2-NEXT: andl $3, %r11d
-; SSE2-NEXT: andl $3, %r15d
-; SSE2-NEXT: leal (%r11,%r15,4), %r11d
-; SSE2-NEXT: andl $3, %r14d
-; SSE2-NEXT: shll $4, %r14d
-; SSE2-NEXT: orl %r11d, %r14d
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT: movb %cl, 4(%r15)
+; SSE2-NEXT: movd %xmm0, %ecx
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: andl $3, %eax
+; SSE2-NEXT: leal (%rcx,%rax,4), %eax
+; SSE2-NEXT: andl $3, %r8d
+; SSE2-NEXT: shll $4, %r8d
+; SSE2-NEXT: orl %eax, %r8d
; SSE2-NEXT: andl $3, %edi
; SSE2-NEXT: shll $6, %edi
-; SSE2-NEXT: orl %r14d, %edi
-; SSE2-NEXT: movzbl %bpl, %r11d
-; SSE2-NEXT: andl $3, %r11d
-; SSE2-NEXT: shll $8, %r11d
-; SSE2-NEXT: orl %edi, %r11d
-; SSE2-NEXT: movzbl %bl, %edi
-; SSE2-NEXT: andl $3, %edi
-; SSE2-NEXT: shll $10, %edi
-; SSE2-NEXT: orl %r11d, %edi
-; SSE2-NEXT: movzbl %r8b, %r8d
-; SSE2-NEXT: movzbl %r10b, %r10d
-; SSE2-NEXT: andl $3, %r10d
-; SSE2-NEXT: shll $12, %r10d
-; SSE2-NEXT: andl $3, %r8d
-; SSE2-NEXT: shll $14, %r8d
-; SSE2-NEXT: orl %r10d, %r8d
-; SSE2-NEXT: movzbl %sil, %esi
-; SSE2-NEXT: andl $3, %esi
-; SSE2-NEXT: shll $16, %esi
-; SSE2-NEXT: orl %r8d, %esi
-; SSE2-NEXT: movzbl %dl, %edx
+; SSE2-NEXT: orl %r8d, %edi
+; SSE2-NEXT: movzbl %r9b, %ecx
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: shll $8, %ecx
+; SSE2-NEXT: orl %edi, %ecx
+; SSE2-NEXT: movzbl %r10b, %eax
+; SSE2-NEXT: andl $3, %eax
+; SSE2-NEXT: shll $10, %eax
+; SSE2-NEXT: orl %ecx, %eax
+; SSE2-NEXT: movzbl %bl, %ecx
+; SSE2-NEXT: movzbl %r11b, %edx
; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $18, %edx
-; SSE2-NEXT: orl %esi, %edx
-; SSE2-NEXT: movzbl %cl, %ecx
+; SSE2-NEXT: shll $12, %edx
; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: shll $20, %ecx
+; SSE2-NEXT: shll $14, %ecx
; SSE2-NEXT: orl %edx, %ecx
-; SSE2-NEXT: movzbl %r9b, %edx
+; SSE2-NEXT: movzbl %bpl, %edx
; SSE2-NEXT: andl $3, %edx
-; SSE2-NEXT: shll $22, %edx
+; SSE2-NEXT: shll $16, %edx
; SSE2-NEXT: orl %ecx, %edx
-; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT: andl $3, %eax
-; SSE2-NEXT: shll $24, %eax
-; SSE2-NEXT: orl %edx, %eax
-; SSE2-NEXT: orl %edi, %eax
+; SSE2-NEXT: movzbl %r14b, %ecx
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: shll $18, %ecx
+; SSE2-NEXT: orl %edx, %ecx
+; SSE2-NEXT: movzbl %r13b, %edx
+; SSE2-NEXT: andl $3, %edx
+; SSE2-NEXT: shll $20, %edx
+; SSE2-NEXT: orl %ecx, %edx
+; SSE2-NEXT: movzbl %r12b, %esi
+; SSE2-NEXT: andl $3, %esi
+; SSE2-NEXT: shll $22, %esi
+; SSE2-NEXT: orl %edx, %esi
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT: andl $3, %ecx
+; SSE2-NEXT: shll $24, %ecx
+; SSE2-NEXT: orl %esi, %ecx
+; SSE2-NEXT: orl %eax, %ecx
+; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: shlq $26, %rdx
-; SSE2-NEXT: andl $3, %ecx
-; SSE2-NEXT: shlq $28, %rcx
-; SSE2-NEXT: orq %rdx, %rcx
+; SSE2-NEXT: andl $3, %eax
+; SSE2-NEXT: shlq $28, %rax
+; SSE2-NEXT: orq %rdx, %rax
; SSE2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
; SSE2-NEXT: andl $3, %edx
; SSE2-NEXT: shlq $30, %rdx
-; SSE2-NEXT: orq %rcx, %rdx
; SSE2-NEXT: orq %rax, %rdx
-; SSE2-NEXT: movq %r12, %rax
-; SSE2-NEXT: movl %edx, (%r12)
-; SSE2-NEXT: addq $96, %rsp
+; SSE2-NEXT: orq %rcx, %rdx
+; SSE2-NEXT: movq %r15, %rax
+; SSE2-NEXT: movl %edx, (%r15)
; SSE2-NEXT: popq %rbx
; SSE2-NEXT: popq %r12
; SSE2-NEXT: popq %r13
@@ -2255,319 +2251,272 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: pushq %r13
; AVX2-NEXT: pushq %r12
; AVX2-NEXT: pushq %rbx
-; AVX2-NEXT: subq $88, %rsp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,127]
; AVX2-NEXT: andl $127, %r8d
; AVX2-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX2-NEXT: andl $127, %edx
; AVX2-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT: andl $127, %r15d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: andl $127, %eax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX2-NEXT: andl $127, %r14d
; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %r10
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm4, %r11
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT: vpextrq $1, %xmm6, %rbx
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm7
+; AVX2-NEXT: vpextrq $1, %xmm7, %r14
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
; AVX2-NEXT: andl $127, %edx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT: andl $127, %ebp
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: andl $127, %r8d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT: andl $127, %r12d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT: andl $127, %r13d
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %rbx, %r11
-; AVX2-NEXT: movq %r13, %r10
-; AVX2-NEXT: sbbq %r12, %r10
-; AVX2-NEXT: setb %r10b
-; AVX2-NEXT: cmpq %r11, %rbx
-; AVX2-NEXT: sbbq %r13, %r12
-; AVX2-NEXT: sbbb $0, %r10b
-; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %r10, %r11
-; AVX2-NEXT: movq %r8, %rbx
-; AVX2-NEXT: sbbq %rbp, %rbx
-; AVX2-NEXT: setb %bl
-; AVX2-NEXT: cmpq %r11, %r10
-; AVX2-NEXT: sbbq %r8, %rbp
-; AVX2-NEXT: sbbb $0, %bl
-; AVX2-NEXT: movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: cmpq %r8, %r10
-; AVX2-NEXT: movq %rdx, %r11
-; AVX2-NEXT: sbbq %r14, %r11
-; AVX2-NEXT: setb %r11b
-; AVX2-NEXT: cmpq %r10, %r8
-; AVX2-NEXT: sbbq %rdx, %r14
-; AVX2-NEXT: sbbb $0, %r11b
-; AVX2-NEXT: movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT: cmpq %rdx, %r8
-; AVX2-NEXT: movq %rax, %r10
-; AVX2-NEXT: sbbq %r15, %r10
-; AVX2-NEXT: setb %r10b
-; AVX2-NEXT: cmpq %r8, %rdx
+; AVX2-NEXT: vpextrq $1, %xmm8, %r15
+; AVX2-NEXT: vmovq %xmm8, %r12
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT: vpextrq $1, %xmm8, %rax
+; AVX2-NEXT: vmovq %xmm8, %rbp
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT: cmpq %r12, %rbp
+; AVX2-NEXT: movq %rax, %r13
+; AVX2-NEXT: sbbq %r15, %r13
+; AVX2-NEXT: vpextrq $1, %xmm8, %r13
+; AVX2-NEXT: setb %r8b
+; AVX2-NEXT: cmpq %rbp, %r12
+; AVX2-NEXT: vmovq %xmm8, %r12
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT: vpextrq $1, %xmm8, %rbp
; AVX2-NEXT: sbbq %rax, %r15
-; AVX2-NEXT: sbbb $0, %r10b
-; AVX2-NEXT: movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: cmpq %rax, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
+; AVX2-NEXT: sbbb $0, %r8b
+; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: vmovq %xmm8, %rax
+; AVX2-NEXT: cmpq %r12, %rax
+; AVX2-NEXT: movq %rbp, %r15
+; AVX2-NEXT: sbbq %r13, %r15
; AVX2-NEXT: setb %r8b
-; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: cmpq %rax, %r12
+; AVX2-NEXT: vmovq %xmm7, %rax
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm7
+; AVX2-NEXT: vpextrq $1, %xmm7, %r15
+; AVX2-NEXT: sbbq %rbp, %r13
; AVX2-NEXT: sbbb $0, %r8b
; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: cmpq %rax, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
+; AVX2-NEXT: vmovq %xmm7, %r12
+; AVX2-NEXT: cmpq %rax, %r12
+; AVX2-NEXT: movq %r15, %r13
+; AVX2-NEXT: sbbq %r14, %r13
+; AVX2-NEXT: vmovq %xmm6, %r13
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
; AVX2-NEXT: setb %r8b
-; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: cmpq %r12, %rax
+; AVX2-NEXT: vpextrq $1, %xmm6, %rax
+; AVX2-NEXT: vmovq %xmm6, %r12
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: vpextrq $1, %xmm6, %r14
; AVX2-NEXT: sbbb $0, %r8b
; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: cmpq %rax, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
+; AVX2-NEXT: cmpq %r13, %r12
+; AVX2-NEXT: movq %rax, %r15
+; AVX2-NEXT: sbbq %rbx, %r15
; AVX2-NEXT: setb %r8b
-; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: cmpq %r12, %r13
+; AVX2-NEXT: vmovq %xmm6, %r15
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT: vpextrq $1, %xmm6, %r12
+; AVX2-NEXT: sbbq %rax, %rbx
+; AVX2-NEXT: vmovq %xmm6, %rax
; AVX2-NEXT: sbbb $0, %r8b
; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: cmpq %rax, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
-; AVX2-NEXT: setb %r12b
-; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
-; AVX2-NEXT: sbbb $0, %r12b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT: cmpq %rax, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: movq %r11, %r8
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT: sbbq %r10, %r8
+; AVX2-NEXT: cmpq %r15, %rax
+; AVX2-NEXT: movq %r12, %rbx
+; AVX2-NEXT: sbbq %r14, %rbx
+; AVX2-NEXT: vpextrq $1, %xmm5, %rbx
; AVX2-NEXT: setb %r8b
-; AVX2-NEXT: cmpq %rdx, %rax
-; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: cmpq %rax, %r15
+; AVX2-NEXT: vmovq %xmm5, %rax
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX2-NEXT: vpextrq $1, %xmm5, %r15
+; AVX2-NEXT: sbbq %r12, %r14
+; AVX2-NEXT: sbbb $0, %r8b
+; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: vmovq %xmm5, %r14
+; AVX2-NEXT: cmpq %rax, %r14
+; AVX2-NEXT: movq %r15, %r12
+; AVX2-NEXT: sbbq %rbx, %r12
+; AVX2-NEXT: setb %r8b
+; AVX2-NEXT: cmpq %r14, %rax
+; AVX2-NEXT: vmovq %xmm4, %rax
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX2-NEXT: vpextrq $1, %xmm4, %r14
+; AVX2-NEXT: sbbq %r15, %rbx
; AVX2-NEXT: sbbb $0, %r8b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT: cmpq %rax, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: movq %rbx, %rdx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT: sbbq %r11, %rdx
-; AVX2-NEXT: setb %dl
-; AVX2-NEXT: cmpq %r10, %rax
-; AVX2-NEXT: sbbq %rbx, %r11
-; AVX2-NEXT: sbbb $0, %dl
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT: cmpq %rax, %r11
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: vmovq %xmm4, %rbx
+; AVX2-NEXT: cmpq %rax, %rbx
+; AVX2-NEXT: movq %r14, %r15
+; AVX2-NEXT: sbbq %r11, %r15
+; AVX2-NEXT: vmovq %xmm3, %r15
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT: setb %r8b
+; AVX2-NEXT: cmpq %rbx, %rax
+; AVX2-NEXT: vpextrq $1, %xmm3, %rax
+; AVX2-NEXT: vmovq %xmm3, %rbx
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT: sbbq %r14, %r11
+; AVX2-NEXT: vpextrq $1, %xmm3, %r11
+; AVX2-NEXT: sbbb $0, %r8b
+; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: cmpq %r15, %rbx
+; AVX2-NEXT: movq %rax, %r14
+; AVX2-NEXT: sbbq %r10, %r14
+; AVX2-NEXT: setb %r8b
+; AVX2-NEXT: cmpq %rbx, %r15
+; AVX2-NEXT: vmovq %xmm3, %rbx
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT: vpextrq $1, %xmm3, %r14
+; AVX2-NEXT: sbbq %rax, %r10
+; AVX2-NEXT: vmovq %xmm3, %rax
+; AVX2-NEXT: sbbb $0, %r8b
+; AVX2-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT: cmpq %rbx, %rax
; AVX2-NEXT: movq %r14, %r10
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT: sbbq %rbx, %r10
+; AVX2-NEXT: sbbq %r11, %r10
+; AVX2-NEXT: vpextrq $1, %xmm2, %r15
; AVX2-NEXT: setb %r10b
-; AVX2-NEXT: cmpq %r11, %rax
-; AVX2-NEXT: sbbq %r14, %rbx
-; AVX2-NEXT: sbbb $0, %r10b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rbx
; AVX2-NEXT: cmpq %rax, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: movq %r15, %r11
-; AVX2-NEXT: movq (%rsp), %r14 # 8-byte Reload
+; AVX2-NEXT: vmovq %xmm2, %rax
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rbx
; AVX2-NEXT: sbbq %r14, %r11
+; AVX2-NEXT: sbbb $0, %r10b
+; AVX2-NEXT: vmovq %xmm2, %r14
+; AVX2-NEXT: cmpq %rax, %r14
+; AVX2-NEXT: movq %rbx, %r11
+; AVX2-NEXT: sbbq %r15, %r11
+; AVX2-NEXT: vpextrq $1, %xmm1, %rbp
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
; AVX2-NEXT: setb %r11b
-; AVX2-NEXT: cmpq %rbx, %rax
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: vpextrq $1, %xmm2, %r12
+; AVX2-NEXT: cmpq %r14, %rax
+; AVX2-NEXT: sbbq %rbx, %r15
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm2, %r14
; AVX2-NEXT: sbbb $0, %r11b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %r14
; AVX2-NEXT: cmpq %rax, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %rbx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %rbx
+; AVX2-NEXT: movq %r12, %rbx
+; AVX2-NEXT: sbbq %rbp, %rbx
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
; AVX2-NEXT: setb %bl
+; AVX2-NEXT: vpextrq $1, %xmm1, %r15
+; AVX2-NEXT: vpextrq $1, %xmm2, %r13
; AVX2-NEXT: cmpq %r14, %rax
-; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: sbbq %r12, %rbp
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm2, %r14
; AVX2-NEXT: sbbb $0, %bl
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: cmpq %r9, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: cmpq %rax, %r14
+; AVX2-NEXT: movq %r13, %r12
+; AVX2-NEXT: sbbq %r15, %r12
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
; AVX2-NEXT: setb %bpl
-; AVX2-NEXT: cmpq %rax, %r9
+; AVX2-NEXT: cmpq %r14, %rax
; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: vmovq %xmm2, %r15
; AVX2-NEXT: sbbb $0, %bpl
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT: cmpq %rax, %r15
+; AVX2-NEXT: vpextrq $1, %xmm1, %r12
+; AVX2-NEXT: vpextrq $1, %xmm2, %r13
+; AVX2-NEXT: movq %r13, %r14
+; AVX2-NEXT: sbbq %r12, %r14
+; AVX2-NEXT: setb %r14b
+; AVX2-NEXT: cmpq %r15, %rax
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: sbbq %r13, %r12
+; AVX2-NEXT: vpextrq $1, %xmm1, %rax
+; AVX2-NEXT: sbbb $0, %r14b
+; AVX2-NEXT: vmovq %xmm1, %r12
+; AVX2-NEXT: cmpq %r9, %r12
+; AVX2-NEXT: movq %rax, %r15
+; AVX2-NEXT: sbbq %rdx, %r15
+; AVX2-NEXT: setb %r15b
+; AVX2-NEXT: cmpq %r12, %r9
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpextrq $1, %xmm1, %r12
+; AVX2-NEXT: sbbq %rax, %rdx
+; AVX2-NEXT: vmovq %xmm1, %rax
+; AVX2-NEXT: sbbb $0, %r15b
; AVX2-NEXT: cmpq %rsi, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: movq %r15, %r9
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: sbbq %r14, %r9
+; AVX2-NEXT: movq %r12, %r8
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT: sbbq %rdx, %r8
; AVX2-NEXT: setb %r9b
; AVX2-NEXT: cmpq %rax, %rsi
-; AVX2-NEXT: sbbq %r15, %r14
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT: vpextrq $1, %xmm2, %rax
+; AVX2-NEXT: sbbq %r12, %rdx
+; AVX2-NEXT: vmovq %xmm2, %rdx
; AVX2-NEXT: sbbb $0, %r9b
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: cmpq %rcx, %rax
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: movq %r15, %rsi
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT: sbbq %r14, %rsi
-; AVX2-NEXT: setb %sil
-; AVX2-NEXT: cmpq %rax, %rcx
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT: sbbq %r15, %r14
-; AVX2-NEXT: sbbb $0, %sil
-; AVX2-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT: cmpq %rax, %rcx
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT: movq %r13, %r14
-; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT: sbbq %r15, %r14
-; AVX2-NEXT: setb %r14b
-; AVX2-NEXT: cmpq %rcx, %rax
-; AVX2-NEXT: sbbq %r13, %r15
+; AVX2-NEXT: cmpq %rcx, %rdx
+; AVX2-NEXT: movq %rax, %rsi
+; AVX2-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT: sbbq %r12, %rsi
+; AVX2-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT: setb %r8b
+; AVX2-NEXT: cmpq %rdx, %rcx
+; AVX2-NEXT: vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT: vmovq %xmm1, %rdx
+; AVX2-NEXT: sbbq %rax, %r12
+; AVX2-NEXT: sbbb $0, %r8b
+; AVX2-NEXT: vmovq %xmm0, %rax
+; AVX2-NEXT: cmpq %rdx, %rax
+; AVX2-NEXT: movq %rcx, %r12
+; AVX2-NEXT: sbbq %rsi, %r12
+; AVX2-NEXT: setb %r12b
+; AVX2-NEXT: cmpq %rax, %rdx
+; AVX2-NEXT: sbbq %rcx, %rsi
; AVX2-NEXT: movq %rdi, %rax
-; AVX2-NEXT: sbbb $0, %r14b
-; AVX2-NEXT: movzbl %r14b, %ecx
+; AVX2-NEXT: sbbb $0, %r12b
+; AVX2-NEXT: movzbl %r12b, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: movb %cl, 4(%rdi)
-; AVX2-NEXT: movzbl %sil, %ecx
+; AVX2-NEXT: movzbl %r8b, %ecx
; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: movzbl %r9b, %esi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: leaq (%rsi,%rcx,4), %rcx
-; AVX2-NEXT: movzbl %bpl, %esi
-; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: shll $4, %esi
-; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: movzbl %bl, %ecx
+; AVX2-NEXT: movzbl %r9b, %edx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: leaq (%rdx,%rcx,4), %rcx
+; AVX2-NEXT: movzbl %r15b, %edx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: shll $4, %edx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: movzbl %r14b, %ecx
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: shll $6, %ecx
-; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
+; AVX2-NEXT: movzbl %bpl, %edx
+; AVX2-NEXT: andl $3, %edx
+; AVX2-NEXT: shll $8, %edx
+; AVX2-NEXT: orq %rcx, %rdx
+; AVX2-NEXT: movzbl %bl, %ecx
+; AVX2-NEXT: andl $3, %ecx
+; AVX2-NEXT: shll $10, %ecx
; AVX2-NEXT: movzbl %r11b, %esi
; AVX2-NEXT: andl $3, %esi
-; AVX2-NEXT: shll $8, %esi
+; AVX2-NEXT: shll $12, %esi
; AVX2-NEXT: orq %rcx, %rsi
-; AVX2-NEXT: movzbl %r10b, %ecx
-; AVX2-NEXT: andl $3, %ecx
-; AVX2-NEXT: shll $10, %ecx
-; AVX2-NEXT: movzbl %dl, %edx
-; AVX2-NEXT: andl $3, %edx
-; AVX2-NEXT: shll $12, %edx
-; AVX2-NEXT: orq %rcx, %rdx
-; AVX2-NEXT: movzbl %r8b, %edi
+; AVX2-NEXT: movzbl %r10b, %edi
; AVX2-NEXT: andl $3, %edi
; AVX2-NEXT: shll $14, %edi
-; AVX2-NEXT: orq %rdx, %rdi
-; AVX2-NEXT: movzbl %r12b, %ecx
+; AVX2-NEXT: orq %rsi, %rdi
+; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; AVX2-NEXT: andl $3, %ecx
; AVX2-NEXT: shll $16, %ecx
; AVX2-NEXT: orq %rdi, %rcx
-; AVX2-NEXT: orq %rsi, %rcx
+; AVX2-NEXT: orq %rdx, %rcx
; AVX2-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
; AVX2-NEXT: andl $3, %edx
; AVX2-NEXT: shll $18, %edx
@@ -2597,7 +2546,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX2-NEXT: orq %rcx, %rsi
; AVX2-NEXT: orq %rdx, %rsi
; AVX2-NEXT: movl %esi, (%rax)
-; AVX2-NEXT: addq $88, %rsp
; AVX2-NEXT: popq %rbx
; AVX2-NEXT: popq %r12
; AVX2-NEXT: popq %r13
@@ -2614,318 +2562,277 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: pushq %r13
; AVX512-NEXT: pushq %r12
; AVX512-NEXT: pushq %rbx
-; AVX512-NEXT: subq $88, %rsp
; AVX512-NEXT: movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq %r8, %r15
; AVX512-NEXT: movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT: movq %rdx, %r14
; AVX512-NEXT: movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
; AVX512-NEXT: movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andl $127, %r8d
-; AVX512-NEXT: movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: andl $127, %edx
-; AVX512-NEXT: movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, (%rsp) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT: andl $127, %ebp
+; AVX512-NEXT: vmovdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; AVX512-NEXT: andl $127, %r15d
+; AVX512-NEXT: andl $127, %r14d
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT: vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT: vmovq %xmm6, %rdi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
; AVX512-NEXT: andl $127, %r12d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT: andl $127, %r13d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT: andl $127, %r15d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: andl $127, %r10d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT: andl $127, %ebx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: andl $127, %r8d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT: andl $127, %r9d
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: andl $127, %esi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: andl $127, %edi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: andl $127, %eax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: andl $127, %edx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: cmpq %r14, %r11
-; AVX512-NEXT: movq %rdx, %rcx
-; AVX512-NEXT: sbbq %rax, %rcx
-; AVX512-NEXT: setb %cl
-; AVX512-NEXT: cmpq %r11, %r14
-; AVX512-NEXT: sbbq %rdx, %rax
-; AVX512-NEXT: sbbb $0, %cl
-; AVX512-NEXT: movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %rdi, %rdx
-; AVX512-NEXT: sbbq %rsi, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rdi, %rsi
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %r9, %rdx
-; AVX512-NEXT: sbbq %r8, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %r9, %r8
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %rbx, %rdx
-; AVX512-NEXT: sbbq %r10, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rbx, %r10
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %r15, %rdx
-; AVX512-NEXT: sbbq %r13, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %r15, %r13
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq %r12, %rdx
-; AVX512-NEXT: sbbq %rbp, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %r12, %rbp
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: movq %rdi, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: sbbq %rsi, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm6, %r8
+; AVX512-NEXT: vmovq %xmm6, %r11
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX512-NEXT: cmpq %rdi, %r11
+; AVX512-NEXT: movq %r8, %rbx
+; AVX512-NEXT: sbbq %rsi, %rbx
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %r11, %rdi
+; AVX512-NEXT: vpextrq $1, %xmm6, %rdi
+; AVX512-NEXT: vmovq %xmm5, %r11
+; AVX512-NEXT: sbbq %r8, %rsi
+; AVX512-NEXT: vmovq %xmm6, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %r11, %rsi
+; AVX512-NEXT: movq %rdi, %r8
+; AVX512-NEXT: sbbq %rdx, %r8
+; AVX512-NEXT: vpextrq $1, %xmm5, %r8
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %rsi, %r11
+; AVX512-NEXT: vmovq %xmm5, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT: vpextrq $1, %xmm5, %rbx
+; AVX512-NEXT: sbbq %rdi, %rdx
+; AVX512-NEXT: vmovq %xmm5, %rdx
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rsi, %rdx
+; AVX512-NEXT: movq %rbx, %rdi
+; AVX512-NEXT: sbbq %r8, %rdi
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: vmovq %xmm4, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT: vpextrq $1, %xmm4, %rsi
+; AVX512-NEXT: sbbq %rbx, %r8
+; AVX512-NEXT: vmovq %xmm4, %rdi
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rdx, %rdi
+; AVX512-NEXT: movq %rsi, %r8
+; AVX512-NEXT: sbbq %rcx, %r8
+; AVX512-NEXT: vpextrq $1, %xmm3, %r8
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %rdi, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT: vmovq %xmm3, %rdi
+; AVX512-NEXT: sbbq %rsi, %rcx
+; AVX512-NEXT: vmovq %xmm4, %rcx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rdi, %rcx
+; AVX512-NEXT: movq %rdx, %rsi
+; AVX512-NEXT: sbbq %r8, %rsi
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %rcx, %rdi
+; AVX512-NEXT: vpextrq $1, %xmm3, %rcx
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: sbbq %rdx, %r8
+; AVX512-NEXT: vmovq %xmm3, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rsi, %rdx
+; AVX512-NEXT: movq %rcx, %rdi
+; AVX512-NEXT: sbbq %rax, %rdi
+; AVX512-NEXT: vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
+; AVX512-NEXT: sbbq %rcx, %rax
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: sbbb $0, %r8b
+; AVX512-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rdx, %rax
+; AVX512-NEXT: movq %rsi, %rcx
+; AVX512-NEXT: sbbq %rdi, %rcx
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: cmpq %rax, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT: sbbq %rsi, %rdi
+; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: sbbb $0, %r8b
+; AVX512-NEXT: movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: movq %rcx, %rdi
+; AVX512-NEXT: sbbq %rax, %rdi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
; AVX512-NEXT: setb %r13b
-; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rdi, %rsi
+; AVX512-NEXT: cmpq %rsi, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rsi
+; AVX512-NEXT: sbbq %rcx, %rax
+; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: vmovq %xmm2, %rcx
; AVX512-NEXT: sbbb $0, %r13b
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; AVX512-NEXT: cmpq %rax, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: movq %rdi, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: sbbq %rsi, %rdx
+; AVX512-NEXT: movq %rsi, %rdi
+; AVX512-NEXT: sbbq %rdx, %rdi
; AVX512-NEXT: setb %bpl
; AVX512-NEXT: cmpq %rcx, %rax
-; AVX512-NEXT: sbbq %rdi, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rax
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: sbbq %rsi, %rdx
+; AVX512-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT: vmovq %xmm1, %rdx
; AVX512-NEXT: sbbb $0, %bpl
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: cmpq %rcx, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: movq %rdi, %rax
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT: sbbq %rsi, %rax
-; AVX512-NEXT: setb %r9b
-; AVX512-NEXT: cmpq %rdx, %rcx
-; AVX512-NEXT: sbbq %rdi, %rsi
-; AVX512-NEXT: sbbb $0, %r9b
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT: vmovq %xmm2, %rsi
; AVX512-NEXT: cmpq %rdx, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT: movq %rdi, %rcx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: sbbq %rax, %rcx
-; AVX512-NEXT: setb %cl
+; AVX512-NEXT: movq %rcx, %rdi
+; AVX512-NEXT: sbbq %rax, %rdi
+; AVX512-NEXT: setb %bl
; AVX512-NEXT: cmpq %rsi, %rdx
-; AVX512-NEXT: sbbq %rdi, %rax
-; AVX512-NEXT: sbbb $0, %cl
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: cmpq %rsi, %rdi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT: movq %r8, %rdx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: sbbq %rcx, %rax
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: vmovq %xmm2, %rcx
+; AVX512-NEXT: sbbb $0, %bl
+; AVX512-NEXT: cmpq %rsi, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512-NEXT: movq %rax, %rdi
+; AVX512-NEXT: sbbq %rdx, %rdi
+; AVX512-NEXT: setb %r11b
+; AVX512-NEXT: cmpq %rcx, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpextrq $1, %xmm1, %rcx
; AVX512-NEXT: sbbq %rax, %rdx
-; AVX512-NEXT: setb %dl
-; AVX512-NEXT: cmpq %rdi, %rsi
-; AVX512-NEXT: sbbq %r8, %rax
-; AVX512-NEXT: sbbb $0, %dl
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: cmpq %rdi, %r8
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT: movq %r10, %rsi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: sbbq %rax, %rsi
-; AVX512-NEXT: setb %sil
-; AVX512-NEXT: cmpq %r8, %rdi
-; AVX512-NEXT: sbbq %r10, %rax
-; AVX512-NEXT: sbbb $0, %sil
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: cmpq %r8, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT: movq %r11, %rdi
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: sbbq %rax, %rdi
-; AVX512-NEXT: setb %dil
-; AVX512-NEXT: cmpq %r10, %r8
-; AVX512-NEXT: sbbq %r11, %rax
-; AVX512-NEXT: sbbb $0, %dil
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: cmpq %rax, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: movq %rbx, %r8
-; AVX512-NEXT: movq (%rsp), %r11 # 8-byte Reload
-; AVX512-NEXT: sbbq %r11, %r8
-; AVX512-NEXT: setb %r8b
-; AVX512-NEXT: cmpq %r10, %rax
-; AVX512-NEXT: sbbq %rbx, %r11
-; AVX512-NEXT: sbbb $0, %r8b
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT: cmpq %rbx, %r11
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: movq %r14, %r10
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: sbbq %rax, %r10
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vmovq %xmm1, %rdx
+; AVX512-NEXT: sbbb $0, %r11b
+; AVX512-NEXT: vmovq %xmm2, %rsi
+; AVX512-NEXT: cmpq %rdx, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm2, %rax
+; AVX512-NEXT: movq %rax, %rdi
+; AVX512-NEXT: sbbq %rcx, %rdi
; AVX512-NEXT: setb %r10b
-; AVX512-NEXT: cmpq %r11, %rbx
-; AVX512-NEXT: sbbq %r14, %rax
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT: cmpq %rsi, %rdx
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: sbbq %rax, %rcx
+; AVX512-NEXT: vpextrq $1, %xmm2, %rcx
; AVX512-NEXT: sbbb $0, %r10b
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT: cmpq %r15, %r11
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: vmovq %xmm2, %rax
+; AVX512-NEXT: cmpq %rsi, %rax
+; AVX512-NEXT: movq %rcx, %rdi
+; AVX512-NEXT: sbbq %rdx, %rdi
+; AVX512-NEXT: setb %r9b
+; AVX512-NEXT: cmpq %rax, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: sbbq %rcx, %rdx
+; AVX512-NEXT: vmovq %xmm1, %rsi
+; AVX512-NEXT: sbbb $0, %r9b
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT: cmpq %rcx, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT: movq %rdx, %rax
+; AVX512-NEXT: sbbq %r12, %rax
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: setb %r8b
+; AVX512-NEXT: vpextrq $1, %xmm1, %rdi
+; AVX512-NEXT: cmpq %rsi, %rcx
+; AVX512-NEXT: sbbq %rdx, %r12
+; AVX512-NEXT: vmovq %xmm1, %rcx
+; AVX512-NEXT: sbbb $0, %r8b
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, %rbx
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: sbbq %r14, %rbx
-; AVX512-NEXT: setb %bl
-; AVX512-NEXT: cmpq %r11, %r15
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT: sbbq %rax, %r14
-; AVX512-NEXT: movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT: sbbb $0, %bl
-; AVX512-NEXT: cmpq %r11, %r14
+; AVX512-NEXT: cmpq %rax, %rcx
+; AVX512-NEXT: movq %rdi, %rdx
+; AVX512-NEXT: sbbq %r14, %rdx
+; AVX512-NEXT: setb %r12b
+; AVX512-NEXT: cmpq %rcx, %rax
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT: vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT: sbbq %rdi, %r14
+; AVX512-NEXT: vmovq %xmm2, %rdx
+; AVX512-NEXT: sbbb $0, %r12b
; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT: movq %rax, %r15
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT: sbbq %r12, %r15
+; AVX512-NEXT: cmpq %rax, %rdx
+; AVX512-NEXT: movq %rcx, %rsi
+; AVX512-NEXT: sbbq %r15, %rsi
+; AVX512-NEXT: vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX512-NEXT: setb %r14b
+; AVX512-NEXT: cmpq %rdx, %rax
+; AVX512-NEXT: vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT: vmovq %xmm1, %rax
+; AVX512-NEXT: sbbq %rcx, %r15
+; AVX512-NEXT: vmovq %xmm0, %rcx
+; AVX512-NEXT: sbbb $0, %r14b
+; AVX512-NEXT: cmpq %rax, %rcx
+; AVX512-NEXT: movq %rdx, %r15
+; AVX512-NEXT: sbbq %rsi, %r15
; AVX512-NEXT: setb %r15b
-; AVX512-NEXT: cmpq %r14, %r11
-; AVX512-NEXT: sbbq %rax, %r12
+; AVX512-NEXT: cmpq %rcx, %rax
+; AVX512-NEXT: sbbq %rdx, %rsi
; AVX512-NEXT: sbbb $0, %r15b
-; AVX512-NEXT: movzbl %r15b, %r11d
-; AVX512-NEXT: andl $3, %r11d
-; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT: movb %r11b, 4(%r14)
-; AVX512-NEXT: movzbl %bl, %r11d
-; AVX512-NEXT: andl $3, %r11d
-; AVX512-NEXT: movzbl %r10b, %r10d
-; AVX512-NEXT: andl $3, %r10d
-; AVX512-NEXT: leaq (%r10,%r11,4), %r10
-; AVX512-NEXT: movzbl %r8b, %r8d
-; AVX512-NEXT: andl $3, %r8d
-; AVX512-NEXT: shll $4, %r8d
-; AVX512-NEXT: orq %r10, %r8
-; AVX512-NEXT: movzbl %dil, %edi
-; AVX512-NEXT: andl $3, %edi
-; AVX512-NEXT: shll $6, %edi
-; AVX512-NEXT: orq %r8, %rdi
-; AVX512-NEXT: movzbl %sil, %esi
-; AVX512-NEXT: andl $3, %esi
-; AVX512-NEXT: shll $8, %esi
-; AVX512-NEXT: orq %rdi, %rsi
-; AVX512-NEXT: movzbl %dl, %edx
-; AVX512-NEXT: andl $3, %edx
-; AVX512-NEXT: shll $10, %edx
-; AVX512-NEXT: movzbl %cl, %ecx
+; AVX512-NEXT: movzbl %r15b, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT: movb %al, 4(%r15)
+; AVX512-NEXT: movzbl %r14b, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: movzbl %r12b, %ecx
; AVX512-NEXT: andl $3, %ecx
-; AVX512-NEXT: shll $12, %ecx
-; AVX512-NEXT: orq %rdx, %rcx
-; AVX512-NEXT: movzbl %r9b, %edx
+; AVX512-NEXT: leaq (%rcx,%rax,4), %rax
+; AVX512-NEXT: movzbl %r8b, %ecx
+; AVX512-NEXT: andl $3, %ecx
+; AVX512-NEXT: shll $4, %ecx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movzbl %r9b, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: shll $6, %eax
+; AVX512-NEXT: orq %rcx, %rax
+; AVX512-NEXT: movzbl %r10b, %ecx
+; AVX512-NEXT: andl $3, %ecx
+; AVX512-NEXT: shll $8, %ecx
+; AVX512-NEXT: orq %rax, %rcx
+; AVX512-NEXT: movzbl %r11b, %eax
+; AVX512-NEXT: andl $3, %eax
+; AVX512-NEXT: shll $10, %eax
+; AVX512-NEXT: movzbl %bl, %edx
; AVX512-NEXT: andl $3, %edx
-; AVX512-NEXT: shll $14, %edx
-; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: movzbl %bpl, %eax
+; AVX512-NEXT: shll $12, %edx
+; AVX512-NEXT: orq %rax, %rdx
+; AVX512-NEXT: movzbl %bpl, %esi
+; AVX512-NEXT: andl $3, %esi
+; AVX512-NEXT: shll $14, %esi
+; AVX512-NEXT: orq %rdx, %rsi
+; AVX512-NEXT: movzbl %r13b, %eax
; AVX512-NEXT: andl $3, %eax
; AVX512-NEXT: shll $16, %eax
-; AVX512-NEXT: orq %rdx, %rax
; AVX512-NEXT: orq %rsi, %rax
-; AVX512-NEXT: movzbl %r13b, %ecx
+; AVX512-NEXT: orq %rcx, %rax
+; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
; AVX512-NEXT: andl $3, %ecx
; AVX512-NEXT: shll $18, %ecx
; AVX512-NEXT: movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
@@ -2953,9 +2860,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
; AVX512-NEXT: shlq $30, %rdx
; AVX512-NEXT: orq %rax, %rdx
; AVX512-NEXT: orq %rcx, %rdx
-; AVX512-NEXT: movq %r14, %rax
-; AVX512-NEXT: movl %edx, (%r14)
-; AVX512-NEXT: addq $88, %rsp
+; AVX512-NEXT: movq %r15, %rax
+; AVX512-NEXT: movl %edx, (%r15)
; AVX512-NEXT: popq %rbx
; AVX512-NEXT: popq %r12
; AVX512-NEXT: popq %r13
>From c58c6a0aadf249e3795653c4e8060bd0bfe0d156 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Fri, 13 Mar 2026 10:50:35 +0000
Subject: [PATCH 8/8] update test
---
llvm/test/CodeGen/X86/pr173924.ll | 33 +++++++++++++++----------------
1 file changed, 16 insertions(+), 17 deletions(-)
diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll
index 17c048c05a7de..a25f62a0ab071 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -6,30 +6,29 @@ define i256 @PR173924(<8 x i256> %a0) {
; CHECK-LABEL: PR173924:
; CHECK: # %bb.0:
; CHECK-NEXT: movq %rdi, %rax
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r8
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r8
; CHECK-NEXT: movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT: andl $1, %r10d
-; CHECK-NEXT: andl $1, %r9d
-; CHECK-NEXT: addq %r10, %r9
; CHECK-NEXT: vmovd {{.*#+}} xmm0 = [1,0,0,0]
-; CHECK-NEXT: vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
-; CHECK-NEXT: vmovq %xmm1, %r10
-; CHECK-NEXT: andl $1, %edx
-; CHECK-NEXT: addq %r10, %rdx
-; CHECK-NEXT: addq %r9, %rdx
-; CHECK-NEXT: andl $1, %r8d
+; CHECK-NEXT: vpand {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; CHECK-NEXT: vmovq %xmm0, %r11
+; CHECK-NEXT: andl $1, %r10d
; CHECK-NEXT: andl $1, %esi
-; CHECK-NEXT: addq %r8, %rsi
-; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: addq %r10, %rsi
+; CHECK-NEXT: andl $1, %r8d
; CHECK-NEXT: andl $1, %ecx
-; CHECK-NEXT: addq %rdi, %rcx
+; CHECK-NEXT: addq %r8, %rcx
; CHECK-NEXT: addq %rsi, %rcx
-; CHECK-NEXT: addq %rdx, %rcx
-; CHECK-NEXT: vmovq %rcx, %xmm1
-; CHECK-NEXT: vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT: andl $1, %edx
+; CHECK-NEXT: addq %r11, %rdx
+; CHECK-NEXT: andl $1, %edi
+; CHECK-NEXT: andl $1, %r9d
+; CHECK-NEXT: addq %rdi, %r9
+; CHECK-NEXT: addq %rdx, %r9
+; CHECK-NEXT: addq %rcx, %r9
+; CHECK-NEXT: vmovq %r9, %xmm0
; CHECK-NEXT: vmovdqu %ymm0, (%rax)
; CHECK-NEXT: vzeroupper
; CHECK-NEXT: retq