[llvm] [X86] Remove single use assumption in combineVectorSizedSetCCEquality (PR #182200)

Gergo Stomfai via llvm-commits llvm-commits at lists.llvm.org
Fri Mar 13 03:51:01 PDT 2026


https://github.com/stomfaig updated https://github.com/llvm/llvm-project/pull/182200

>From 1abcc7d8bd5f0ee89ee1d05ad7d6975e1b9969a6 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:23:56 +0000
Subject: [PATCH 1/8] add check that all users of a load are foldable

---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 36 +++++++++---
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 59 +++++++-------------
 llvm/test/CodeGen/X86/pr166534.ll            | 47 +++++++++-------
 3 files changed, 77 insertions(+), 65 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 927a49b203968..aefeb4bf38912 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2959,14 +2959,36 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     case ISD::SUB:
     case ISD::FSHL:
     case ISD::FSHR:
-      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
-             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
+      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget, AssumeSingleUse) &&
+             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse);
     case ISD::SELECT:
-      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
-             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
+      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse) &&
+             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget, AssumeSingleUse);
     }
   }
-  return X86::mayFoldLoad(Op, Subtarget, AssumeSingleUse,
+  if (!ISD::isNormalLoad(Op.getNode()))
+    return false;
+
+  // Single-use loads just check the load itself
+  if (AssumeSingleUse || Op.hasOneUse())
+    return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
+                            /*IgnoreAlignment=*/true);
+
+  for (SDUse &Use : Op->uses()) {
+    if (Use.getResNo() != 0)
+      continue;
+
+    SDNode *User = Use.getUser();
+    if (ISD::isNormalStore(User))
+      continue;
+
+    if (!mayFoldIntoVector(SDValue(User, 0), DAG, Subtarget,
+                           /*AssumeSingleUse=*/true))
+      return false;
+  }
+
+  // All users are vectorizable, now check the load itself
+  return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/false,
                           /*IgnoreAlignment=*/true);
 }
 
@@ -23540,8 +23562,8 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
 
   // Don't perform this combine if constructing the vector will be expensive.
   // TODO: Drop AssumeSingleUse = true override.
-  if ((!mayFoldIntoVector(X, DAG, Subtarget, /*AssumeSingleUse=*/true) ||
-       !mayFoldIntoVector(Y, DAG, Subtarget, /*AssumeSingleUse=*/true)) &&
+  if ((!mayFoldIntoVector(X, DAG, Subtarget) ||
+       !mayFoldIntoVector(Y, DAG, Subtarget)) &&
       !IsOrXorXorTreeCCZero)
     return SDValue();
 
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 7070848e3fe3e..4980bc89ae74a 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1199,45 +1199,26 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: chain_reset_i256:
-; SSE:       # %bb.0:
-; SSE-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; SSE-NEXT:    movl $-2, %eax
-; SSE-NEXT:    roll %cl, %eax
-; SSE-NEXT:    shrl $3, %ecx
-; SSE-NEXT:    andl $28, %ecx
-; SSE-NEXT:    andl %eax, (%rdi,%rcx)
-; SSE-NEXT:    movq (%rdi), %rcx
-; SSE-NEXT:    movq 8(%rdi), %r8
-; SSE-NEXT:    orq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %rdi
-; SSE-NEXT:    orq %rcx, %rdi
-; SSE-NEXT:    movl (%rsi), %eax
-; SSE-NEXT:    movl %ecx, (%rsi)
-; SSE-NEXT:    movl (%rdx), %ecx
-; SSE-NEXT:    addl %ecx, %eax
-; SSE-NEXT:    orq %r8, %rdi
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    retq
-;
-; AVX-LABEL: chain_reset_i256:
-; AVX:       # %bb.0:
-; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; AVX-NEXT:    movl $-2, %eax
-; AVX-NEXT:    roll %cl, %eax
-; AVX-NEXT:    shrl $3, %ecx
-; AVX-NEXT:    andl $28, %ecx
-; AVX-NEXT:    andl %eax, (%rdi,%rcx)
-; AVX-NEXT:    vmovdqu (%rdi), %ymm0
-; AVX-NEXT:    movl (%rdi), %ecx
-; AVX-NEXT:    movl (%rsi), %eax
-; AVX-NEXT:    movl %ecx, (%rsi)
-; AVX-NEXT:    movl (%rdx), %ecx
-; AVX-NEXT:    addl %ecx, %eax
-; AVX-NEXT:    vptest %ymm0, %ymm0
-; AVX-NEXT:    cmovnel %ecx, %eax
-; AVX-NEXT:    vzeroupper
-; AVX-NEXT:    retq
+; X64-LABEL: chain_reset_i256:
+; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; X64-NEXT:    movl $-2, %eax
+; X64-NEXT:    roll %cl, %eax
+; X64-NEXT:    shrl $3, %ecx
+; X64-NEXT:    andl $28, %ecx
+; X64-NEXT:    andl %eax, (%rdi,%rcx)
+; X64-NEXT:    movq (%rdi), %rcx
+; X64-NEXT:    movq 8(%rdi), %r8
+; X64-NEXT:    orq 24(%rdi), %r8
+; X64-NEXT:    movq 16(%rdi), %rdi
+; X64-NEXT:    orq %rcx, %rdi
+; X64-NEXT:    movl (%rsi), %eax
+; X64-NEXT:    movl %ecx, (%rsi)
+; X64-NEXT:    movl (%rdx), %ecx
+; X64-NEXT:    addl %ecx, %eax
+; X64-NEXT:    orq %r8, %rdi
+; X64-NEXT:    cmovnel %ecx, %eax
+; X64-NEXT:    retq
   %rem = and i32 %position, 255
   %ofs = zext nneg i32 %rem to i256
   %bit = shl nuw i256 1, %ofs
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index 162a0c93bfcf4..fc31fe9ce4bd6 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,15 +7,16 @@
 define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movdqu (%rdi), %xmm0
-; SSE2-NEXT:    movdqu (%rsi), %xmm1
-; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
-; SSE2-NEXT:    pmovmskb %xmm1, %esi
+; SSE2-NEXT:    movq (%rdi), %r8
+; SSE2-NEXT:    movq 8(%rdi), %rdi
+; SSE2-NEXT:    xorq 8(%rsi), %rdi
+; SSE2-NEXT:    xorq (%rsi), %r8
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT:    movq %r8, %rsi
+; SSE2-NEXT:    orq %rdi, %rsi
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    orq %rax, (%rdx)
-; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
+; SSE2-NEXT:    orq %rdi, %r8
 ; SSE2-NEXT:    jne .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %if.then
 ; SSE2-NEXT:    orq %rax, (%rcx)
@@ -24,14 +25,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movdqu (%rdi), %xmm0
-; SSE4-NEXT:    movdqu (%rsi), %xmm1
-; SSE4-NEXT:    pxor %xmm0, %xmm1
+; SSE4-NEXT:    movq (%rdi), %r8
+; SSE4-NEXT:    movq 8(%rdi), %rdi
+; SSE4-NEXT:    xorq 8(%rsi), %rdi
+; SSE4-NEXT:    xorq (%rsi), %r8
 ; SSE4-NEXT:    xorl %eax, %eax
-; SSE4-NEXT:    ptest %xmm1, %xmm1
+; SSE4-NEXT:    movq %r8, %rsi
+; SSE4-NEXT:    orq %rdi, %rsi
 ; SSE4-NEXT:    sete %al
 ; SSE4-NEXT:    orq %rax, (%rdx)
-; SSE4-NEXT:    ptest %xmm1, %xmm1
+; SSE4-NEXT:    orq %rdi, %r8
 ; SSE4-NEXT:    jne .LBB0_2
 ; SSE4-NEXT:  # %bb.1: # %if.then
 ; SSE4-NEXT:    orq %rax, (%rcx)
@@ -40,13 +43,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; AVX2-LABEL: pr166534:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX2-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; AVX2-NEXT:    movq (%rdi), %r8
+; AVX2-NEXT:    movq 8(%rdi), %rdi
+; AVX2-NEXT:    xorq 8(%rsi), %rdi
+; AVX2-NEXT:    xorq (%rsi), %r8
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    movq %r8, %rsi
+; AVX2-NEXT:    orq %rdi, %rsi
 ; AVX2-NEXT:    sete %al
 ; AVX2-NEXT:    orq %rax, (%rdx)
-; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    orq %rdi, %r8
 ; AVX2-NEXT:    jne .LBB0_2
 ; AVX2-NEXT:  # %bb.1: # %if.then
 ; AVX2-NEXT:    orq %rax, (%rcx)
@@ -55,13 +61,16 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; AVX512-LABEL: pr166534:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
-; AVX512-NEXT:    vpxor (%rsi), %xmm0, %xmm0
+; AVX512-NEXT:    movq (%rdi), %r8
+; AVX512-NEXT:    movq 8(%rdi), %rdi
+; AVX512-NEXT:    xorq 8(%rsi), %rdi
+; AVX512-NEXT:    xorq (%rsi), %r8
 ; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    vptest %xmm0, %xmm0
+; AVX512-NEXT:    movq %r8, %rsi
+; AVX512-NEXT:    orq %rdi, %rsi
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    orq %rax, (%rdx)
-; AVX512-NEXT:    vptest %xmm0, %xmm0
+; AVX512-NEXT:    orq %rdi, %r8
 ; AVX512-NEXT:    jne .LBB0_2
 ; AVX512-NEXT:  # %bb.1: # %if.then
 ; AVX512-NEXT:    orq %rax, (%rcx)

>From 3fd4eea3881a34c11eff7b2f3cbd5046055a7011 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:26:12 +0000
Subject: [PATCH 2/8] format

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aefeb4bf38912..8c23ceabd9894 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2959,11 +2959,15 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     case ISD::SUB:
     case ISD::FSHL:
     case ISD::FSHR:
-      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget, AssumeSingleUse) &&
-             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse);
+      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget,
+                               AssumeSingleUse) &&
+             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
+                               AssumeSingleUse);
     case ISD::SELECT:
-      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget, AssumeSingleUse) &&
-             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget, AssumeSingleUse);
+      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
+                               AssumeSingleUse) &&
+             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget,
+                               AssumeSingleUse);
     }
   }
   if (!ISD::isNormalLoad(Op.getNode()))

>From be09704213d653a1020bdff6db3cb2e10cee4ed9 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 00:41:24 +0000
Subject: [PATCH 3/8] remove todo

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 8c23ceabd9894..dcb42f0576fba 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -23565,7 +23565,6 @@ static SDValue combineVectorSizedSetCCEquality(EVT VT, SDValue X, SDValue Y,
     return SDValue();
 
   // Don't perform this combine if constructing the vector will be expensive.
-  // TODO: Drop AssumeSingleUse = true override.
   if ((!mayFoldIntoVector(X, DAG, Subtarget) ||
        !mayFoldIntoVector(Y, DAG, Subtarget)) &&
       !IsOrXorXorTreeCCZero)

>From 951da19e959cf464f53595e61b348a0640fc7bc2 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 19 Feb 2026 21:36:23 +0000
Subject: [PATCH 4/8] add backward handling for setcc, truncate and vector
 types

---
 llvm/lib/Target/X86/X86ISelLowering.cpp      | 30 +++++++-
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 80 +++++++++++++++-----
 llvm/test/CodeGen/X86/pr166534.ll            | 47 +++++-------
 llvm/test/CodeGen/X86/pr173924.ll            | 41 +++++-----
 4 files changed, 127 insertions(+), 71 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index dcb42f0576fba..71cb5edf385a4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2940,6 +2940,7 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     return true;
   if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
     return true;
+
   EVT VT = Op.getValueType();
   unsigned Opcode = Op.getOpcode();
   if ((VT == MVT::i128 || VT == MVT::i256 || VT == MVT::i512) &&
@@ -2970,6 +2971,7 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
                                AssumeSingleUse);
     }
   }
+
   if (!ISD::isNormalLoad(Op.getNode()))
     return false;
 
@@ -2986,13 +2988,35 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     if (ISD::isNormalStore(User))
       continue;
 
-    if (!mayFoldIntoVector(SDValue(User, 0), DAG, Subtarget,
+    if (User->getOpcode() == ISD::SETCC) {
+      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
+      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
+        if (mayFoldIntoVector(User->getOperand(0), DAG, Subtarget,
+                              /*AssumeSingleUse=*/true) &&
+            mayFoldIntoVector(User->getOperand(1), DAG, Subtarget,
+                              /*AssumeSingleUse=*/true))
+          continue;
+      }
+      return false;
+    }
+
+    if (User->getOpcode() == ISD::TRUNCATE)
+      continue;
+
+    SDValue Value = SDValue(User, 0);
+
+    if (isa<ConstantSDNode>(Value) || isa<ConstantFPSDNode>(Value))
+      continue;
+
+    if (Value.getValueType().isVector())
+      continue;
+
+    if (!mayFoldIntoVector(Value, DAG, Subtarget,
                            /*AssumeSingleUse=*/true))
       return false;
   }
 
-  // All users are vectorizable, now check the load itself
-  return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/false,
+  return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
                           /*IgnoreAlignment=*/true);
 }
 
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index 4980bc89ae74a..c880f39081baf 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1199,26 +1199,66 @@ define i32 @chain_reset_i256(ptr %p0, ptr %p1, ptr %p2, i32 %position) nounwind
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: chain_reset_i256:
-; X64:       # %bb.0:
-; X64-NEXT:    # kill: def $ecx killed $ecx def $rcx
-; X64-NEXT:    movl $-2, %eax
-; X64-NEXT:    roll %cl, %eax
-; X64-NEXT:    shrl $3, %ecx
-; X64-NEXT:    andl $28, %ecx
-; X64-NEXT:    andl %eax, (%rdi,%rcx)
-; X64-NEXT:    movq (%rdi), %rcx
-; X64-NEXT:    movq 8(%rdi), %r8
-; X64-NEXT:    orq 24(%rdi), %r8
-; X64-NEXT:    movq 16(%rdi), %rdi
-; X64-NEXT:    orq %rcx, %rdi
-; X64-NEXT:    movl (%rsi), %eax
-; X64-NEXT:    movl %ecx, (%rsi)
-; X64-NEXT:    movl (%rdx), %ecx
-; X64-NEXT:    addl %ecx, %eax
-; X64-NEXT:    orq %r8, %rdi
-; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    retq
+; SSE2-LABEL: chain_reset_i256:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; SSE2-NEXT:    movl $-2, %eax
+; SSE2-NEXT:    roll %cl, %eax
+; SSE2-NEXT:    shrl $3, %ecx
+; SSE2-NEXT:    andl $28, %ecx
+; SSE2-NEXT:    andl %eax, (%rdi,%rcx)
+; SSE2-NEXT:    movl (%rdi), %ecx
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    por 16(%rdi), %xmm0
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    movmskps %xmm1, %edi
+; SSE2-NEXT:    xorl $15, %edi
+; SSE2-NEXT:    movl (%rsi), %eax
+; SSE2-NEXT:    movl %ecx, (%rsi)
+; SSE2-NEXT:    movl (%rdx), %ecx
+; SSE2-NEXT:    addl %ecx, %eax
+; SSE2-NEXT:    testl %edi, %edi
+; SSE2-NEXT:    cmovnel %ecx, %eax
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: chain_reset_i256:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; SSE4-NEXT:    movl $-2, %eax
+; SSE4-NEXT:    roll %cl, %eax
+; SSE4-NEXT:    shrl $3, %ecx
+; SSE4-NEXT:    andl $28, %ecx
+; SSE4-NEXT:    andl %eax, (%rdi,%rcx)
+; SSE4-NEXT:    movl (%rdi), %ecx
+; SSE4-NEXT:    movdqa (%rdi), %xmm0
+; SSE4-NEXT:    por 16(%rdi), %xmm0
+; SSE4-NEXT:    movl (%rsi), %eax
+; SSE4-NEXT:    movl %ecx, (%rsi)
+; SSE4-NEXT:    movl (%rdx), %ecx
+; SSE4-NEXT:    addl %ecx, %eax
+; SSE4-NEXT:    ptest %xmm0, %xmm0
+; SSE4-NEXT:    cmovnel %ecx, %eax
+; SSE4-NEXT:    retq
+;
+; AVX-LABEL: chain_reset_i256:
+; AVX:       # %bb.0:
+; AVX-NEXT:    # kill: def $ecx killed $ecx def $rcx
+; AVX-NEXT:    movl $-2, %eax
+; AVX-NEXT:    roll %cl, %eax
+; AVX-NEXT:    shrl $3, %ecx
+; AVX-NEXT:    andl $28, %ecx
+; AVX-NEXT:    andl %eax, (%rdi,%rcx)
+; AVX-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX-NEXT:    movl (%rdi), %ecx
+; AVX-NEXT:    movl (%rsi), %eax
+; AVX-NEXT:    movl %ecx, (%rsi)
+; AVX-NEXT:    movl (%rdx), %ecx
+; AVX-NEXT:    addl %ecx, %eax
+; AVX-NEXT:    vptest %ymm0, %ymm0
+; AVX-NEXT:    cmovnel %ecx, %eax
+; AVX-NEXT:    vzeroupper
+; AVX-NEXT:    retq
   %rem = and i32 %position, 255
   %ofs = zext nneg i32 %rem to i256
   %bit = shl nuw i256 1, %ofs
diff --git a/llvm/test/CodeGen/X86/pr166534.ll b/llvm/test/CodeGen/X86/pr166534.ll
index fc31fe9ce4bd6..162a0c93bfcf4 100644
--- a/llvm/test/CodeGen/X86/pr166534.ll
+++ b/llvm/test/CodeGen/X86/pr166534.ll
@@ -7,16 +7,15 @@
 define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ; SSE2-LABEL: pr166534:
 ; SSE2:       # %bb.0: # %entry
-; SSE2-NEXT:    movq (%rdi), %r8
-; SSE2-NEXT:    movq 8(%rdi), %rdi
-; SSE2-NEXT:    xorq 8(%rsi), %rdi
-; SSE2-NEXT:    xorq (%rsi), %r8
+; SSE2-NEXT:    movdqu (%rdi), %xmm0
+; SSE2-NEXT:    movdqu (%rsi), %xmm1
+; SSE2-NEXT:    pcmpeqb %xmm0, %xmm1
+; SSE2-NEXT:    pmovmskb %xmm1, %esi
 ; SSE2-NEXT:    xorl %eax, %eax
-; SSE2-NEXT:    movq %r8, %rsi
-; SSE2-NEXT:    orq %rdi, %rsi
+; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    orq %rax, (%rdx)
-; SSE2-NEXT:    orq %rdi, %r8
+; SSE2-NEXT:    cmpl $65535, %esi # imm = 0xFFFF
 ; SSE2-NEXT:    jne .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %if.then
 ; SSE2-NEXT:    orq %rax, (%rcx)
@@ -25,16 +24,14 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; SSE4-LABEL: pr166534:
 ; SSE4:       # %bb.0: # %entry
-; SSE4-NEXT:    movq (%rdi), %r8
-; SSE4-NEXT:    movq 8(%rdi), %rdi
-; SSE4-NEXT:    xorq 8(%rsi), %rdi
-; SSE4-NEXT:    xorq (%rsi), %r8
+; SSE4-NEXT:    movdqu (%rdi), %xmm0
+; SSE4-NEXT:    movdqu (%rsi), %xmm1
+; SSE4-NEXT:    pxor %xmm0, %xmm1
 ; SSE4-NEXT:    xorl %eax, %eax
-; SSE4-NEXT:    movq %r8, %rsi
-; SSE4-NEXT:    orq %rdi, %rsi
+; SSE4-NEXT:    ptest %xmm1, %xmm1
 ; SSE4-NEXT:    sete %al
 ; SSE4-NEXT:    orq %rax, (%rdx)
-; SSE4-NEXT:    orq %rdi, %r8
+; SSE4-NEXT:    ptest %xmm1, %xmm1
 ; SSE4-NEXT:    jne .LBB0_2
 ; SSE4-NEXT:  # %bb.1: # %if.then
 ; SSE4-NEXT:    orq %rax, (%rcx)
@@ -43,16 +40,13 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; AVX2-LABEL: pr166534:
 ; AVX2:       # %bb.0: # %entry
-; AVX2-NEXT:    movq (%rdi), %r8
-; AVX2-NEXT:    movq 8(%rdi), %rdi
-; AVX2-NEXT:    xorq 8(%rsi), %rdi
-; AVX2-NEXT:    xorq (%rsi), %r8
+; AVX2-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX2-NEXT:    vpxor (%rsi), %xmm0, %xmm0
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    movq %r8, %rsi
-; AVX2-NEXT:    orq %rdi, %rsi
+; AVX2-NEXT:    vptest %xmm0, %xmm0
 ; AVX2-NEXT:    sete %al
 ; AVX2-NEXT:    orq %rax, (%rdx)
-; AVX2-NEXT:    orq %rdi, %r8
+; AVX2-NEXT:    vptest %xmm0, %xmm0
 ; AVX2-NEXT:    jne .LBB0_2
 ; AVX2-NEXT:  # %bb.1: # %if.then
 ; AVX2-NEXT:    orq %rax, (%rcx)
@@ -61,16 +55,13 @@ define void @pr166534(ptr %pa, ptr %pb, ptr %pc, ptr %pd) {
 ;
 ; AVX512-LABEL: pr166534:
 ; AVX512:       # %bb.0: # %entry
-; AVX512-NEXT:    movq (%rdi), %r8
-; AVX512-NEXT:    movq 8(%rdi), %rdi
-; AVX512-NEXT:    xorq 8(%rsi), %rdi
-; AVX512-NEXT:    xorq (%rsi), %r8
+; AVX512-NEXT:    vmovdqu (%rdi), %xmm0
+; AVX512-NEXT:    vpxor (%rsi), %xmm0, %xmm0
 ; AVX512-NEXT:    xorl %eax, %eax
-; AVX512-NEXT:    movq %r8, %rsi
-; AVX512-NEXT:    orq %rdi, %rsi
+; AVX512-NEXT:    vptest %xmm0, %xmm0
 ; AVX512-NEXT:    sete %al
 ; AVX512-NEXT:    orq %rax, (%rdx)
-; AVX512-NEXT:    orq %rdi, %r8
+; AVX512-NEXT:    vptest %xmm0, %xmm0
 ; AVX512-NEXT:    jne .LBB0_2
 ; AVX512-NEXT:  # %bb.1: # %if.then
 ; AVX512-NEXT:    orq %rax, (%rcx)
diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll
index d130014a8fa62..17c048c05a7de 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -6,29 +6,30 @@ define i256 @PR173924(<8 x i256> %a0) {
 ; CHECK-LABEL: PR173924:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edi
-; CHECK-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %xmm0
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %edx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %ecx
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r8d
-; CHECK-NEXT:    movl {{[0-9]+}}(%rsp), %r10d
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
 ; CHECK-NEXT:    andl $1, %r10d
-; CHECK-NEXT:    andl $1, %esi
-; CHECK-NEXT:    addl %r10d, %esi
-; CHECK-NEXT:    andl $1, %r8d
-; CHECK-NEXT:    andl $1, %ecx
-; CHECK-NEXT:    addl %r8d, %ecx
-; CHECK-NEXT:    addl %esi, %ecx
+; CHECK-NEXT:    andl $1, %r9d
+; CHECK-NEXT:    addq %r10, %r9
+; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = [1,0,0,0]
+; CHECK-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; CHECK-NEXT:    vmovq %xmm1, %r10
 ; CHECK-NEXT:    andl $1, %edx
-; CHECK-NEXT:    vpand {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0
-; CHECK-NEXT:    vmovq %xmm0, %rsi
+; CHECK-NEXT:    addq %r10, %rdx
+; CHECK-NEXT:    addq %r9, %rdx
+; CHECK-NEXT:    andl $1, %r8d
+; CHECK-NEXT:    andl $1, %esi
+; CHECK-NEXT:    addq %r8, %rsi
 ; CHECK-NEXT:    andl $1, %edi
-; CHECK-NEXT:    andl $1, %r9d
-; CHECK-NEXT:    addl %edi, %r9d
-; CHECK-NEXT:    addl %edx, %esi
-; CHECK-NEXT:    addl %r9d, %esi
-; CHECK-NEXT:    addl %ecx, %esi
-; CHECK-NEXT:    vmovd %esi, %xmm0
+; CHECK-NEXT:    andl $1, %ecx
+; CHECK-NEXT:    addq %rdi, %rcx
+; CHECK-NEXT:    addq %rsi, %rcx
+; CHECK-NEXT:    addq %rdx, %rcx
+; CHECK-NEXT:    vmovq %rcx, %xmm1
+; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
 ; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq

>From 97d39e5cbec2299757b84b449678d9a5cdfe450a Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Tue, 10 Mar 2026 10:50:09 +0000
Subject: [PATCH 5/8] remove setcc traversal in load check

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 71cb5edf385a4..206b54c519319 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2990,13 +2990,8 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
 
     if (User->getOpcode() == ISD::SETCC) {
       ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
-      if (CC == ISD::SETEQ || CC == ISD::SETNE) {
-        if (mayFoldIntoVector(User->getOperand(0), DAG, Subtarget,
-                              /*AssumeSingleUse=*/true) &&
-            mayFoldIntoVector(User->getOperand(1), DAG, Subtarget,
-                              /*AssumeSingleUse=*/true))
-          continue;
-      }
+      if (CC == ISD::SETEQ || CC == ISD::SETNE)
+        continue;
       return false;
     }
 

>From 8c9c50d4fa6026abc83e5ba09d2bd057a3186a57 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 12 Mar 2026 09:00:39 +0000
Subject: [PATCH 6/8] remove load user checks

---
 llvm/lib/Target/X86/X86ISelLowering.cpp      |  55 +----
 llvm/test/CodeGen/X86/bittest-big-integer.ll | 235 +++++++++++--------
 llvm/test/CodeGen/X86/setcc-wide-types.ll    | 167 ++++++++-----
 llvm/test/CodeGen/X86/urem-seteq.ll          |   3 +-
 4 files changed, 251 insertions(+), 209 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index 206b54c519319..8906f16a8dd17 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -2934,8 +2934,7 @@ bool X86::mayFoldIntoZeroExtend(SDValue Op) {
 
 // Return true if its cheap to bitcast this to a vector type.
 static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
-                              const X86Subtarget &Subtarget,
-                              bool AssumeSingleUse = false) {
+                              const X86Subtarget &Subtarget) {
   if (peekThroughBitcasts(Op).getValueType().isVector())
     return true;
   if (isa<ConstantSDNode>(Op) || isa<ConstantFPSDNode>(Op))
@@ -2960,57 +2959,13 @@ static bool mayFoldIntoVector(SDValue Op, const SelectionDAG &DAG,
     case ISD::SUB:
     case ISD::FSHL:
     case ISD::FSHR:
-      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget,
-                               AssumeSingleUse) &&
-             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
-                               AssumeSingleUse);
+      return mayFoldIntoVector(Op.getOperand(0), DAG, Subtarget) &&
+             mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget);
     case ISD::SELECT:
-      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget,
-                               AssumeSingleUse) &&
-             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget,
-                               AssumeSingleUse);
+      return mayFoldIntoVector(Op.getOperand(1), DAG, Subtarget) &&
+             mayFoldIntoVector(Op.getOperand(2), DAG, Subtarget);
     }
   }
-
-  if (!ISD::isNormalLoad(Op.getNode()))
-    return false;
-
-  // Single-use loads just check the load itself
-  if (AssumeSingleUse || Op.hasOneUse())
-    return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
-                            /*IgnoreAlignment=*/true);
-
-  for (SDUse &Use : Op->uses()) {
-    if (Use.getResNo() != 0)
-      continue;
-
-    SDNode *User = Use.getUser();
-    if (ISD::isNormalStore(User))
-      continue;
-
-    if (User->getOpcode() == ISD::SETCC) {
-      ISD::CondCode CC = cast<CondCodeSDNode>(User->getOperand(2))->get();
-      if (CC == ISD::SETEQ || CC == ISD::SETNE)
-        continue;
-      return false;
-    }
-
-    if (User->getOpcode() == ISD::TRUNCATE)
-      continue;
-
-    SDValue Value = SDValue(User, 0);
-
-    if (isa<ConstantSDNode>(Value) || isa<ConstantFPSDNode>(Value))
-      continue;
-
-    if (Value.getValueType().isVector())
-      continue;
-
-    if (!mayFoldIntoVector(Value, DAG, Subtarget,
-                           /*AssumeSingleUse=*/true))
-      return false;
-  }
-
   return X86::mayFoldLoad(Op, Subtarget, /*AssumeSingleUse=*/true,
                           /*IgnoreAlignment=*/true);
 }
diff --git a/llvm/test/CodeGen/X86/bittest-big-integer.ll b/llvm/test/CodeGen/X86/bittest-big-integer.ll
index c880f39081baf..96ccc7b0f7527 100644
--- a/llvm/test/CodeGen/X86/bittest-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bittest-big-integer.ll
@@ -1892,114 +1892,157 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; X86-NEXT:    popl %ebp
 ; X86-NEXT:    retl
 ;
-; SSE-LABEL: blsr_u512:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 48(%rdi), %r11
-; SSE-NEXT:    movq 40(%rdi), %r9
-; SSE-NEXT:    movq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %rdx
-; SSE-NEXT:    movq (%rdi), %rcx
-; SSE-NEXT:    movq 8(%rdi), %rsi
-; SSE-NEXT:    rep bsfq %rcx, %rax
-; SSE-NEXT:    rep bsfq %rsi, %rbx
-; SSE-NEXT:    addq $64, %rbx
-; SSE-NEXT:    testq %rcx, %rcx
-; SSE-NEXT:    cmovneq %rax, %rbx
-; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    rep bsfq %r8, %r10
-; SSE-NEXT:    addq $64, %r10
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovneq %rax, %r10
-; SSE-NEXT:    movq 32(%rdi), %r14
-; SSE-NEXT:    subq $-128, %r10
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    orq %rsi, %rax
-; SSE-NEXT:    cmovneq %rbx, %r10
-; SSE-NEXT:    rep bsfq %r14, %rax
-; SSE-NEXT:    rep bsfq %r9, %rbx
-; SSE-NEXT:    addq $64, %rbx
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovneq %rax, %rbx
-; SSE-NEXT:    rep bsfq %r11, %r15
-; SSE-NEXT:    movl $64, %eax
-; SSE-NEXT:    rep bsfq 56(%rdi), %rax
-; SSE-NEXT:    addq $64, %rax
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovneq %r15, %rax
-; SSE-NEXT:    subq $-128, %rax
-; SSE-NEXT:    orq %r9, %r14
-; SSE-NEXT:    cmovneq %rbx, %rax
-; SSE-NEXT:    addq $256, %rax # imm = 0x100
-; SSE-NEXT:    orq %r8, %rsi
-; SSE-NEXT:    orq %rdx, %rcx
-; SSE-NEXT:    orq %rsi, %rcx
-; SSE-NEXT:    cmovneq %r10, %rax
-; SSE-NEXT:    movl $-2, %edx
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    roll %cl, %edx
-; SSE-NEXT:    movl %eax, %ecx
-; SSE-NEXT:    shrl $3, %ecx
-; SSE-NEXT:    andl $60, %ecx
-; SSE-NEXT:    andl %edx, (%rdi,%rcx)
-; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    retq
+; SSE2-LABEL: blsr_u512:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    movdqa (%rdi), %xmm0
+; SSE2-NEXT:    movq 48(%rdi), %r8
+; SSE2-NEXT:    movq 40(%rdi), %rdx
+; SSE2-NEXT:    movq 32(%rdi), %rsi
+; SSE2-NEXT:    movq (%rdi), %rax
+; SSE2-NEXT:    movq 8(%rdi), %r9
+; SSE2-NEXT:    rep bsfq %rax, %rcx
+; SSE2-NEXT:    rep bsfq %r9, %r10
+; SSE2-NEXT:    addq $64, %r10
+; SSE2-NEXT:    testq %rax, %rax
+; SSE2-NEXT:    cmovneq %rcx, %r10
+; SSE2-NEXT:    movq 16(%rdi), %r11
+; SSE2-NEXT:    rep bsfq %r11, %rbx
+; SSE2-NEXT:    rep bsfq 24(%rdi), %rcx
+; SSE2-NEXT:    addq $64, %rcx
+; SSE2-NEXT:    testq %r11, %r11
+; SSE2-NEXT:    cmovneq %rbx, %rcx
+; SSE2-NEXT:    subq $-128, %rcx
+; SSE2-NEXT:    orq %r9, %rax
+; SSE2-NEXT:    cmovneq %r10, %rcx
+; SSE2-NEXT:    rep bsfq %rsi, %rax
+; SSE2-NEXT:    rep bsfq %rdx, %r9
+; SSE2-NEXT:    addq $64, %r9
+; SSE2-NEXT:    testq %rsi, %rsi
+; SSE2-NEXT:    cmovneq %rax, %r9
+; SSE2-NEXT:    rep bsfq %r8, %r10
+; SSE2-NEXT:    movl $64, %eax
+; SSE2-NEXT:    rep bsfq 56(%rdi), %rax
+; SSE2-NEXT:    addq $64, %rax
+; SSE2-NEXT:    testq %r8, %r8
+; SSE2-NEXT:    cmovneq %r10, %rax
+; SSE2-NEXT:    subq $-128, %rax
+; SSE2-NEXT:    orq %rdx, %rsi
+; SSE2-NEXT:    cmovneq %r9, %rax
+; SSE2-NEXT:    por 16(%rdi), %xmm0
+; SSE2-NEXT:    addq $256, %rax # imm = 0x100
+; SSE2-NEXT:    pxor %xmm1, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    movmskps %xmm1, %edx
+; SSE2-NEXT:    xorl $15, %edx
+; SSE2-NEXT:    cmovneq %rcx, %rax
+; SSE2-NEXT:    movl $-2, %edx
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    roll %cl, %edx
+; SSE2-NEXT:    movl %eax, %ecx
+; SSE2-NEXT:    shrl $3, %ecx
+; SSE2-NEXT:    andl $60, %ecx
+; SSE2-NEXT:    andl %edx, (%rdi,%rcx)
+; SSE2-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    retq
+;
+; SSE4-LABEL: blsr_u512:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    movdqa (%rdi), %xmm0
+; SSE4-NEXT:    movq 48(%rdi), %rdx
+; SSE4-NEXT:    movq 40(%rdi), %rcx
+; SSE4-NEXT:    movq (%rdi), %rax
+; SSE4-NEXT:    movq 8(%rdi), %r8
+; SSE4-NEXT:    rep bsfq %rax, %rsi
+; SSE4-NEXT:    rep bsfq %r8, %r9
+; SSE4-NEXT:    addq $64, %r9
+; SSE4-NEXT:    testq %rax, %rax
+; SSE4-NEXT:    cmovneq %rsi, %r9
+; SSE4-NEXT:    movq 16(%rdi), %r10
+; SSE4-NEXT:    rep bsfq %r10, %r11
+; SSE4-NEXT:    rep bsfq 24(%rdi), %rsi
+; SSE4-NEXT:    addq $64, %rsi
+; SSE4-NEXT:    testq %r10, %r10
+; SSE4-NEXT:    cmovneq %r11, %rsi
+; SSE4-NEXT:    subq $-128, %rsi
+; SSE4-NEXT:    orq %r8, %rax
+; SSE4-NEXT:    cmovneq %r9, %rsi
+; SSE4-NEXT:    movq 32(%rdi), %r8
+; SSE4-NEXT:    rep bsfq %r8, %rax
+; SSE4-NEXT:    rep bsfq %rcx, %r9
+; SSE4-NEXT:    addq $64, %r9
+; SSE4-NEXT:    testq %r8, %r8
+; SSE4-NEXT:    cmovneq %rax, %r9
+; SSE4-NEXT:    rep bsfq %rdx, %r10
+; SSE4-NEXT:    movl $64, %eax
+; SSE4-NEXT:    rep bsfq 56(%rdi), %rax
+; SSE4-NEXT:    addq $64, %rax
+; SSE4-NEXT:    testq %rdx, %rdx
+; SSE4-NEXT:    cmovneq %r10, %rax
+; SSE4-NEXT:    subq $-128, %rax
+; SSE4-NEXT:    orq %rcx, %r8
+; SSE4-NEXT:    cmovneq %r9, %rax
+; SSE4-NEXT:    addq $256, %rax # imm = 0x100
+; SSE4-NEXT:    por 16(%rdi), %xmm0
+; SSE4-NEXT:    ptest %xmm0, %xmm0
+; SSE4-NEXT:    cmovneq %rsi, %rax
+; SSE4-NEXT:    movl $-2, %edx
+; SSE4-NEXT:    movl %eax, %ecx
+; SSE4-NEXT:    roll %cl, %edx
+; SSE4-NEXT:    movl %eax, %ecx
+; SSE4-NEXT:    shrl $3, %ecx
+; SSE4-NEXT:    andl $60, %ecx
+; SSE4-NEXT:    andl %edx, (%rdi,%rcx)
+; SSE4-NEXT:    # kill: def $eax killed $eax killed $rax
+; SSE4-NEXT:    retq
 ;
 ; AVX2-LABEL: blsr_u512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 40(%rdi), %r9
-; AVX2-NEXT:    movq 32(%rdi), %r10
-; AVX2-NEXT:    movq 24(%rdi), %r8
-; AVX2-NEXT:    movq 16(%rdi), %rdx
-; AVX2-NEXT:    movq (%rdi), %rcx
-; AVX2-NEXT:    movq 8(%rdi), %rsi
-; AVX2-NEXT:    tzcntq %rcx, %rax
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
+; AVX2-NEXT:    movq 48(%rdi), %rsi
+; AVX2-NEXT:    movq 40(%rdi), %rcx
+; AVX2-NEXT:    movq 32(%rdi), %rdx
+; AVX2-NEXT:    movq 16(%rdi), %rax
+; AVX2-NEXT:    movq (%rdi), %r9
+; AVX2-NEXT:    movq 8(%rdi), %r10
+; AVX2-NEXT:    tzcntq %r9, %r8
+; AVX2-NEXT:    tzcntq %r10, %r11
+; AVX2-NEXT:    addq $64, %r11
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovneq %r8, %r11
 ; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %rsi, %rbx
-; AVX2-NEXT:    addq $64, %rbx
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovneq %rax, %rbx
+; AVX2-NEXT:    tzcntq %rax, %rbx
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq 24(%rdi), %r8
+; AVX2-NEXT:    addq $64, %r8
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovneq %rbx, %r8
+; AVX2-NEXT:    subq $-128, %r8
+; AVX2-NEXT:    orq %r10, %r9
+; AVX2-NEXT:    cmovneq %r11, %r8
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    tzcntq %r8, %r11
-; AVX2-NEXT:    addq $64, %r11
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    tzcntq %rcx, %r9
+; AVX2-NEXT:    addq $64, %r9
 ; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovneq %rax, %r11
-; AVX2-NEXT:    subq $-128, %r11
-; AVX2-NEXT:    movq %rcx, %rax
-; AVX2-NEXT:    orq %rsi, %rax
-; AVX2-NEXT:    cmovneq %rbx, %r11
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r10, %rax
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %r9, %rbx
-; AVX2-NEXT:    addq $64, %rbx
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovneq %rax, %rbx
-; AVX2-NEXT:    movq 48(%rdi), %r14
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r14, %r15
+; AVX2-NEXT:    cmovneq %rax, %r9
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    tzcntq %rsi, %r10
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq 56(%rdi), %rax
 ; AVX2-NEXT:    addq $64, %rax
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovneq %r15, %rax
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    cmovneq %r10, %rax
 ; AVX2-NEXT:    subq $-128, %rax
-; AVX2-NEXT:    orq %r9, %r10
-; AVX2-NEXT:    cmovneq %rbx, %rax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovneq %r9, %rax
 ; AVX2-NEXT:    addq $256, %rax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %rsi
-; AVX2-NEXT:    orq %rdx, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    cmovneq %r11, %rax
+; AVX2-NEXT:    vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    cmovneq %r8, %rax
 ; AVX2-NEXT:    movl $-2, %edx
 ; AVX2-NEXT:    movl %eax, %ecx
 ; AVX2-NEXT:    roll %cl, %edx
@@ -2009,8 +2052,6 @@ define i32 @blsr_u512(ptr %word) nounwind {
 ; AVX2-NEXT:    andl %edx, (%rdi,%rcx)
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512-LABEL: blsr_u512:
diff --git a/llvm/test/CodeGen/X86/setcc-wide-types.ll b/llvm/test/CodeGen/X86/setcc-wide-types.ll
index 1e53dc01ed168..59de15b6a43a8 100644
--- a/llvm/test/CodeGen/X86/setcc-wide-types.ll
+++ b/llvm/test/CodeGen/X86/setcc-wide-types.ll
@@ -376,17 +376,17 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
 ;
 ; AVX1-LABEL: ne_v4i256:
 ; AVX1:       # %bb.0:
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX1-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %r10
+; AVX1-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
 ; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
-; AVX1-NEXT:    orq %r10, %rcx
-; AVX1-NEXT:    vmovq %rcx, %xmm0
-; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %rax
+; AVX1-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX1-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX1-NEXT:    orq %rcx, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm1
 ; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
-; AVX1-NEXT:    orq %rax, %rdx
-; AVX1-NEXT:    vmovq %rdx, %xmm1
-; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX1-NEXT:    vmovq %xmm0, %rax
+; AVX1-NEXT:    orq %rdx, %rax
+; AVX1-NEXT:    vmovq %rax, %xmm0
+; AVX1-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %r9
 ; AVX1-NEXT:    orq {{[0-9]+}}(%rsp), %rsi
 ; AVX1-NEXT:    orq %r9, %rsi
@@ -404,17 +404,17 @@ define i1 @ne_v4i256(<4 x i256> %a0) {
 ;
 ; AVX2-LABEL: ne_v4i256:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    orq %r10, %rcx
-; AVX2-NEXT:    vmovq %rcx, %xmm0
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX2-NEXT:    orq %rcx, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm1
 ; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    orq %rax, %rdx
-; AVX2-NEXT:    vmovq %rdx, %xmm1
-; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    orq %rdx, %rax
+; AVX2-NEXT:    vmovq %rax, %xmm0
+; AVX2-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm0[0],xmm1[0]
 ; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r9
 ; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rsi
 ; AVX2-NEXT:    orq %r9, %rsi
@@ -1068,27 +1068,49 @@ define i1 @eq_i256_args(i256 %a, i256 %b) {
 }
 
 define i1 @eq_i512_args(i512 %a, i512 %b) {
-; CHECK-LABEL: eq_i512_args:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rcx
-; CHECK-NEXT:    orq %r10, %rcx
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %r9
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rsi
-; CHECK-NEXT:    orq %r9, %rsi
-; CHECK-NEXT:    orq %rcx, %rsi
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rdx
-; CHECK-NEXT:    orq %rax, %rdx
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %r8
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT:    orq %r8, %rdi
-; CHECK-NEXT:    orq %rdx, %rdi
-; CHECK-NEXT:    orq %rsi, %rdi
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; SSE-LABEL: eq_i512_args:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    orq %r10, %rcx
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT:    orq %r9, %rsi
+; SSE-NEXT:    orq %rcx, %rsi
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    orq %rax, %rdx
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
+; SSE-NEXT:    orq %r8, %rdi
+; SSE-NEXT:    orq %rdx, %rdi
+; SSE-NEXT:    orq %rsi, %rdi
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: eq_i512_args:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %r9
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %rsi
+; AVX-NEXT:    orq %r9, %rsi
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %rcx
+; AVX-NEXT:    vpxor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vpextrq $1, %xmm0, %rax
+; AVX-NEXT:    orq %rcx, %rax
+; AVX-NEXT:    orq %rsi, %rax
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %r8
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %rdi
+; AVX-NEXT:    orq %r8, %rdi
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %rdx
+; AVX-NEXT:    vmovq %xmm0, %rcx
+; AVX-NEXT:    orq %rdx, %rcx
+; AVX-NEXT:    orq %rdi, %rcx
+; AVX-NEXT:    orq %rax, %rcx
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    retq
   %r = icmp eq i512 %a, %b
   ret i1 %r
 }
@@ -1225,28 +1247,51 @@ define i1 @eq_i256_load_arg(ptr%p, i256 %b) {
 }
 
 define i1 @eq_i512_load_arg(ptr%p, i512 %b) {
-; CHECK-LABEL: eq_i512_load_arg:
-; CHECK:       # %bb.0:
-; CHECK-NEXT:    movq 40(%rdi), %rax
-; CHECK-NEXT:    movq 48(%rdi), %r10
-; CHECK-NEXT:    movq 56(%rdi), %r11
-; CHECK-NEXT:    xorq 24(%rdi), %r8
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %r11
-; CHECK-NEXT:    orq %r8, %r11
-; CHECK-NEXT:    xorq 8(%rdi), %rdx
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
-; CHECK-NEXT:    orq %rdx, %rax
-; CHECK-NEXT:    orq %r11, %rax
-; CHECK-NEXT:    xorq 32(%rdi), %r9
-; CHECK-NEXT:    xorq (%rdi), %rsi
-; CHECK-NEXT:    orq %r9, %rsi
-; CHECK-NEXT:    xorq 16(%rdi), %rcx
-; CHECK-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    orq %rcx, %r10
-; CHECK-NEXT:    orq %rsi, %r10
-; CHECK-NEXT:    orq %rax, %r10
-; CHECK-NEXT:    sete %al
-; CHECK-NEXT:    retq
+; SSE-LABEL: eq_i512_load_arg:
+; SSE:       # %bb.0:
+; SSE-NEXT:    movq 40(%rdi), %rax
+; SSE-NEXT:    movq 48(%rdi), %r10
+; SSE-NEXT:    movq 56(%rdi), %r11
+; SSE-NEXT:    xorq 24(%rdi), %r8
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    orq %r8, %r11
+; SSE-NEXT:    xorq 8(%rdi), %rdx
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    orq %rdx, %rax
+; SSE-NEXT:    orq %r11, %rax
+; SSE-NEXT:    xorq 32(%rdi), %r9
+; SSE-NEXT:    xorq (%rdi), %rsi
+; SSE-NEXT:    orq %r9, %rsi
+; SSE-NEXT:    xorq 16(%rdi), %rcx
+; SSE-NEXT:    xorq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    orq %rcx, %r10
+; SSE-NEXT:    orq %rsi, %r10
+; SSE-NEXT:    orq %rax, %r10
+; SSE-NEXT:    sete %al
+; SSE-NEXT:    retq
+;
+; AVX-LABEL: eq_i512_load_arg:
+; AVX:       # %bb.0:
+; AVX-NEXT:    vmovdqa 48(%rdi), %xmm0
+; AVX-NEXT:    movq 40(%rdi), %rax
+; AVX-NEXT:    xorq 8(%rdi), %rdx
+; AVX-NEXT:    xorq {{[0-9]+}}(%rsp), %rax
+; AVX-NEXT:    orq %rdx, %rax
+; AVX-NEXT:    xorq 24(%rdi), %r8
+; AVX-NEXT:    vpxor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX-NEXT:    orq %r8, %rdx
+; AVX-NEXT:    orq %rax, %rdx
+; AVX-NEXT:    xorq 32(%rdi), %r9
+; AVX-NEXT:    xorq (%rdi), %rsi
+; AVX-NEXT:    orq %r9, %rsi
+; AVX-NEXT:    xorq 16(%rdi), %rcx
+; AVX-NEXT:    vmovq %xmm0, %rax
+; AVX-NEXT:    orq %rcx, %rax
+; AVX-NEXT:    orq %rsi, %rax
+; AVX-NEXT:    orq %rdx, %rax
+; AVX-NEXT:    sete %al
+; AVX-NEXT:    retq
   %a = load i512, ptr %p
   %r = icmp eq i512 %a, %b
   ret i1 %r
diff --git a/llvm/test/CodeGen/X86/urem-seteq.ll b/llvm/test/CodeGen/X86/urem-seteq.ll
index 72e91ce80d1a5..9441ccf2a2843 100644
--- a/llvm/test/CodeGen/X86/urem-seteq.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq.ll
@@ -369,7 +369,8 @@ define void @ossfuzz34366() {
 ;
 ; X64-LABEL: ossfuzz34366:
 ; X64:       # %bb.0:
-; X64-NEXT:    cmpq $0, (%rax)
+; X64-NEXT:    movq (%rax), %rax
+; X64-NEXT:    orq %rax, %rax
 ; X64-NEXT:    sete (%rax)
 ; X64-NEXT:    retq
   %L10 = load i448, ptr undef, align 4

>From 875449af23b9710c35dd1e1ce11c2d498333804d Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Thu, 12 Mar 2026 10:19:10 +0000
Subject: [PATCH 7/8] update tests on rebase

---
 llvm/test/CodeGen/X86/bit-manip-i256.ll     |  263 +-
 llvm/test/CodeGen/X86/bit-manip-i512.ll     | 1001 ++--
 llvm/test/CodeGen/X86/bitcnt-big-integer.ll | 4693 ++++++++-----------
 llvm/test/CodeGen/X86/funnel-shift-i256.ll  |  278 +-
 llvm/test/CodeGen/X86/funnel-shift-i512.ll  | 1129 ++---
 llvm/test/CodeGen/X86/ucmp.ll               | 2306 +++++----
 6 files changed, 4303 insertions(+), 5367 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bit-manip-i256.ll b/llvm/test/CodeGen/X86/bit-manip-i256.ll
index 1867f068828a2..dba8d0d3dd07f 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i256.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i256.ll
@@ -2983,67 +2983,124 @@ define i256 @isolate_msb_i256_vector(<4 x i64> %v0, i256 %idx) nounwind {
 }
 
 define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
-; SSE-LABEL: isolate_msb_i256_load:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 16(%rsi), %r8
-; SSE-NEXT:    movq 24(%rsi), %r9
-; SSE-NEXT:    movq (%rsi), %rax
-; SSE-NEXT:    movq 8(%rsi), %rsi
-; SSE-NEXT:    movq %rsi, %rdx
-; SSE-NEXT:    orq %r9, %rdx
-; SSE-NEXT:    bsrq %rax, %rcx
-; SSE-NEXT:    orq %r8, %rax
-; SSE-NEXT:    bsrq %r9, %r10
-; SSE-NEXT:    xorq $63, %r10
-; SSE-NEXT:    bsrq %r8, %r11
-; SSE-NEXT:    xorq $63, %r11
-; SSE-NEXT:    orq $64, %r11
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovneq %r10, %r11
-; SSE-NEXT:    bsrq %rsi, %r10
-; SSE-NEXT:    xorq $63, %r10
-; SSE-NEXT:    xorq $63, %rcx
-; SSE-NEXT:    orq $64, %rcx
-; SSE-NEXT:    testq %rsi, %rsi
-; SSE-NEXT:    cmovneq %r10, %rcx
-; SSE-NEXT:    orq $128, %rcx
-; SSE-NEXT:    orq %r9, %r8
-; SSE-NEXT:    cmovneq %r11, %rcx
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movabsq $-9223372036854775808, %rsi # imm = 0x8000000000000000
-; SSE-NEXT:    movq %rsi, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %ecx, %esi
-; SSE-NEXT:    shrb $6, %sil
-; SSE-NEXT:    movzbl %sil, %r8d
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq -40(%rsp,%r8,8), %rsi
-; SSE-NEXT:    movq -48(%rsp,%r8,8), %r9
-; SSE-NEXT:    movq %r9, %r10
-; SSE-NEXT:    shrdq %cl, %rsi, %r10
-; SSE-NEXT:    movq -56(%rsp,%r8,8), %r11
-; SSE-NEXT:    movq %r11, %rbx
-; SSE-NEXT:    shrdq %cl, %r9, %rbx
-; SSE-NEXT:    movq -64(%rsp,%r8,8), %r8
-; SSE-NEXT:    shrq %cl, %rsi
-; SSE-NEXT:    # kill: def $cl killed $cl killed $rcx
-; SSE-NEXT:    shrdq %cl, %r11, %r8
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    orq %rdx, %rax
-; SSE-NEXT:    cmoveq %rcx, %rbx
-; SSE-NEXT:    cmoveq %rcx, %r10
-; SSE-NEXT:    cmoveq %rcx, %r8
-; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    cmoveq %rcx, %rsi
-; SSE-NEXT:    movq %rsi, 24(%rdi)
-; SSE-NEXT:    movq %r10, 16(%rdi)
-; SSE-NEXT:    movq %rbx, 8(%rdi)
-; SSE-NEXT:    movq %r8, (%rdi)
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    retq
+; SSE2-LABEL: isolate_msb_i256_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    movq 8(%rsi), %r9
+; SSE2-NEXT:    movq 16(%rsi), %rdx
+; SSE2-NEXT:    movq 24(%rsi), %r8
+; SSE2-NEXT:    movdqa (%rsi), %xmm1
+; SSE2-NEXT:    por 16(%rsi), %xmm1
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    movmskps %xmm1, %eax
+; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    bsrq %r8, %rcx
+; SSE2-NEXT:    xorq $63, %rcx
+; SSE2-NEXT:    bsrq %rdx, %r10
+; SSE2-NEXT:    xorq $63, %r10
+; SSE2-NEXT:    orq $64, %r10
+; SSE2-NEXT:    testq %r8, %r8
+; SSE2-NEXT:    cmovneq %rcx, %r10
+; SSE2-NEXT:    bsrq %r9, %r11
+; SSE2-NEXT:    xorq $63, %r11
+; SSE2-NEXT:    bsrq (%rsi), %rcx
+; SSE2-NEXT:    xorq $63, %rcx
+; SSE2-NEXT:    orq $64, %rcx
+; SSE2-NEXT:    testq %r9, %r9
+; SSE2-NEXT:    cmovneq %r11, %rcx
+; SSE2-NEXT:    orq $128, %rcx
+; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    cmovneq %r10, %rcx
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rdx # imm = 0x8000000000000000
+; SSE2-NEXT:    movq %rdx, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movl %ecx, %edx
+; SSE2-NEXT:    shrb $6, %dl
+; SSE2-NEXT:    movzbl %dl, %esi
+; SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq -48(%rsp,%rsi,8), %rdx
+; SSE2-NEXT:    movq -56(%rsp,%rsi,8), %r8
+; SSE2-NEXT:    movq %r8, %r9
+; SSE2-NEXT:    shrdq %cl, %rdx, %r9
+; SSE2-NEXT:    movq -64(%rsp,%rsi,8), %r10
+; SSE2-NEXT:    movq %r10, %r11
+; SSE2-NEXT:    shrdq %cl, %r8, %r11
+; SSE2-NEXT:    movq -72(%rsp,%rsi,8), %rsi
+; SSE2-NEXT:    shrq %cl, %rdx
+; SSE2-NEXT:    # kill: def $cl killed $cl killed $rcx
+; SSE2-NEXT:    shrdq %cl, %r10, %rsi
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    cmoveq %rcx, %r11
+; SSE2-NEXT:    cmoveq %rcx, %r9
+; SSE2-NEXT:    cmoveq %rcx, %rsi
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    cmoveq %rcx, %rdx
+; SSE2-NEXT:    movq %rdx, 24(%rdi)
+; SSE2-NEXT:    movq %r9, 16(%rdi)
+; SSE2-NEXT:    movq %r11, 8(%rdi)
+; SSE2-NEXT:    movq %rsi, (%rdi)
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: isolate_msb_i256_load:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    movq 16(%rsi), %rax
+; SSE42-NEXT:    movq 24(%rsi), %rdx
+; SSE42-NEXT:    movdqa (%rsi), %xmm0
+; SSE42-NEXT:    por 16(%rsi), %xmm0
+; SSE42-NEXT:    bsrq %rdx, %rcx
+; SSE42-NEXT:    xorq $63, %rcx
+; SSE42-NEXT:    bsrq %rax, %r8
+; SSE42-NEXT:    xorq $63, %r8
+; SSE42-NEXT:    orq $64, %r8
+; SSE42-NEXT:    testq %rdx, %rdx
+; SSE42-NEXT:    cmovneq %rcx, %r8
+; SSE42-NEXT:    movq 8(%rsi), %r9
+; SSE42-NEXT:    bsrq %r9, %r10
+; SSE42-NEXT:    bsrq (%rsi), %rcx
+; SSE42-NEXT:    xorq $63, %r10
+; SSE42-NEXT:    xorq $63, %rcx
+; SSE42-NEXT:    orq $64, %rcx
+; SSE42-NEXT:    testq %r9, %r9
+; SSE42-NEXT:    cmovneq %r10, %rcx
+; SSE42-NEXT:    orq $128, %rcx
+; SSE42-NEXT:    orq %rdx, %rax
+; SSE42-NEXT:    cmovneq %r8, %rcx
+; SSE42-NEXT:    xorps %xmm1, %xmm1
+; SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movabsq $-9223372036854775808, %rax # imm = 0x8000000000000000
+; SSE42-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movl %ecx, %eax
+; SSE42-NEXT:    shrb $6, %al
+; SSE42-NEXT:    movzbl %al, %eax
+; SSE42-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq -48(%rsp,%rax,8), %rdx
+; SSE42-NEXT:    movq -56(%rsp,%rax,8), %rsi
+; SSE42-NEXT:    movq %rsi, %r8
+; SSE42-NEXT:    shrdq %cl, %rdx, %r8
+; SSE42-NEXT:    movq -64(%rsp,%rax,8), %r9
+; SSE42-NEXT:    movq %r9, %r10
+; SSE42-NEXT:    shrdq %cl, %rsi, %r10
+; SSE42-NEXT:    movq -72(%rsp,%rax,8), %rsi
+; SSE42-NEXT:    shrq %cl, %rdx
+; SSE42-NEXT:    # kill: def $cl killed $cl killed $rcx
+; SSE42-NEXT:    shrdq %cl, %r9, %rsi
+; SSE42-NEXT:    xorl %ecx, %ecx
+; SSE42-NEXT:    ptest %xmm0, %xmm0
+; SSE42-NEXT:    cmoveq %rcx, %r10
+; SSE42-NEXT:    cmoveq %rcx, %r8
+; SSE42-NEXT:    cmoveq %rcx, %rsi
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    cmoveq %rcx, %rdx
+; SSE42-NEXT:    movq %rdx, 24(%rdi)
+; SSE42-NEXT:    movq %r8, 16(%rdi)
+; SSE42-NEXT:    movq %r10, 8(%rdi)
+; SSE42-NEXT:    movq %rsi, (%rdi)
+; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: isolate_msb_i256_load:
 ; AVX2:       # %bb.0:
@@ -3098,25 +3155,17 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
 ;
 ; AVX512F-LABEL: isolate_msb_i256_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq 8(%rsi), %r8
-; AVX512F-NEXT:    movq 16(%rsi), %rax
-; AVX512F-NEXT:    movq 24(%rsi), %rdx
 ; AVX512F-NEXT:    vmovdqu (%rsi), %ymm0
-; AVX512F-NEXT:    lzcntq %rdx, %rcx
-; AVX512F-NEXT:    lzcntq %rax, %r9
-; AVX512F-NEXT:    addq $64, %r9
-; AVX512F-NEXT:    testq %rdx, %rdx
-; AVX512F-NEXT:    cmovneq %rcx, %r9
-; AVX512F-NEXT:    lzcntq (%rsi), %rcx
 ; AVX512F-NEXT:    vmovaps {{.*#+}} zmm1 = [0,0,0,9223372036854775808,0,0,0,0]
 ; AVX512F-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    lzcntq %r8, %rsi
-; AVX512F-NEXT:    addq $64, %rcx
-; AVX512F-NEXT:    testq %r8, %r8
-; AVX512F-NEXT:    cmovneq %rsi, %rcx
-; AVX512F-NEXT:    subq $-128, %rcx
-; AVX512F-NEXT:    orq %rdx, %rax
-; AVX512F-NEXT:    cmovneq %r9, %rcx
+; AVX512F-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k0
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
+; AVX512F-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512F-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512F-NEXT:    kshiftrw $12, %k0, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm1, %ecx
 ; AVX512F-NEXT:    movl %ecx, %eax
 ; AVX512F-NEXT:    shrb $6, %al
 ; AVX512F-NEXT:    movzbl %al, %edx
@@ -3147,26 +3196,16 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
 ; AVX512VL-LABEL: isolate_msb_i256_load:
 ; AVX512VL:       # %bb.0:
 ; AVX512VL-NEXT:    vmovdqu (%rsi), %ymm0
-; AVX512VL-NEXT:    movq 8(%rsi), %rax
-; AVX512VL-NEXT:    movq 16(%rsi), %rdx
-; AVX512VL-NEXT:    movq 24(%rsi), %r8
-; AVX512VL-NEXT:    lzcntq %r8, %rcx
-; AVX512VL-NEXT:    lzcntq %rdx, %r9
-; AVX512VL-NEXT:    addq $64, %r9
-; AVX512VL-NEXT:    testq %r8, %r8
-; AVX512VL-NEXT:    cmovneq %rcx, %r9
-; AVX512VL-NEXT:    lzcntq %rax, %r10
-; AVX512VL-NEXT:    lzcntq (%rsi), %rcx
-; AVX512VL-NEXT:    addq $64, %rcx
-; AVX512VL-NEXT:    testq %rax, %rax
-; AVX512VL-NEXT:    cmovneq %r10, %rcx
-; AVX512VL-NEXT:    subq $-128, %rcx
-; AVX512VL-NEXT:    orq %r8, %rdx
-; AVX512VL-NEXT:    cmovneq %r9, %rcx
-; AVX512VL-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512VL-NEXT:    vptestmq %ymm1, %ymm1, %k1
+; AVX512VL-NEXT:    vplzcntq %ymm1, %ymm1
+; AVX512VL-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VL-NEXT:    vpcompressq %ymm1, %ymm1 {%k1} {z}
+; AVX512VL-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX512VL-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VL-NEXT:    movl %ecx, %eax
 ; AVX512VL-NEXT:    shrb $6, %al
 ; AVX512VL-NEXT:    movzbl %al, %edx
@@ -3197,26 +3236,16 @@ define i256 @isolate_msb_i256_load(ptr %p0, i256 %idx) nounwind {
 ; AVX512VBMI-LABEL: isolate_msb_i256_load:
 ; AVX512VBMI:       # %bb.0:
 ; AVX512VBMI-NEXT:    vmovdqu (%rsi), %ymm0
-; AVX512VBMI-NEXT:    movq 8(%rsi), %rax
-; AVX512VBMI-NEXT:    movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT:    movq 24(%rsi), %r8
-; AVX512VBMI-NEXT:    lzcntq %r8, %rcx
-; AVX512VBMI-NEXT:    lzcntq %rdx, %r9
-; AVX512VBMI-NEXT:    addq $64, %r9
-; AVX512VBMI-NEXT:    testq %r8, %r8
-; AVX512VBMI-NEXT:    cmovneq %rcx, %r9
-; AVX512VBMI-NEXT:    lzcntq %rax, %r10
-; AVX512VBMI-NEXT:    lzcntq (%rsi), %rcx
-; AVX512VBMI-NEXT:    addq $64, %rcx
-; AVX512VBMI-NEXT:    testq %rax, %rax
-; AVX512VBMI-NEXT:    cmovneq %r10, %rcx
-; AVX512VBMI-NEXT:    subq $-128, %rcx
-; AVX512VBMI-NEXT:    orq %r8, %rdx
-; AVX512VBMI-NEXT:    cmovneq %r9, %rcx
-; AVX512VBMI-NEXT:    vxorps %xmm1, %xmm1, %xmm1
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vpermq {{.*#+}} ymm1 = ymm0[3,2,1,0]
+; AVX512VBMI-NEXT:    vptestmq %ymm1, %ymm1, %k1
+; AVX512VBMI-NEXT:    vplzcntq %ymm1, %ymm1
+; AVX512VBMI-NEXT:    vpaddq {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm1
+; AVX512VBMI-NEXT:    vpcompressq %ymm1, %ymm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vxorps %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX512VBMI-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX512VBMI-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VBMI-NEXT:    movl %ecx, %eax
 ; AVX512VBMI-NEXT:    shrb $6, %al
 ; AVX512VBMI-NEXT:    movzbl %al, %edx
diff --git a/llvm/test/CodeGen/X86/bit-manip-i512.ll b/llvm/test/CodeGen/X86/bit-manip-i512.ll
index 3723280d2bfa2..407df83d1b2c3 100644
--- a/llvm/test/CodeGen/X86/bit-manip-i512.ll
+++ b/llvm/test/CodeGen/X86/bit-manip-i512.ll
@@ -1826,178 +1826,62 @@ define i512 @blsi_i512_load(ptr %p0) nounwind {
 ;
 ; AVX512F-LABEL: blsi_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %rbp
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    movq 56(%rsi), %r14
-; AVX512F-NEXT:    movq 48(%rsi), %rbx
-; AVX512F-NEXT:    movq 40(%rsi), %r10
-; AVX512F-NEXT:    movq (%rsi), %rdi
-; AVX512F-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    movq 8(%rsi), %rcx
-; AVX512F-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512F-NEXT:    xorl %r8d, %r8d
-; AVX512F-NEXT:    negq %rdi
-; AVX512F-NEXT:    movl $0, %r9d
-; AVX512F-NEXT:    sbbq %rcx, %r9
-; AVX512F-NEXT:    movq 16(%rsi), %r15
-; AVX512F-NEXT:    movl $0, %r11d
-; AVX512F-NEXT:    sbbq %r15, %r11
-; AVX512F-NEXT:    movq 24(%rsi), %r12
-; AVX512F-NEXT:    movl $0, %r13d
-; AVX512F-NEXT:    sbbq %r12, %r13
-; AVX512F-NEXT:    movq 32(%rsi), %rsi
-; AVX512F-NEXT:    movl $0, %ebp
-; AVX512F-NEXT:    sbbq %rsi, %rbp
-; AVX512F-NEXT:    movl $0, %edx
-; AVX512F-NEXT:    sbbq %r10, %rdx
-; AVX512F-NEXT:    movl $0, %ecx
-; AVX512F-NEXT:    sbbq %rbx, %rcx
-; AVX512F-NEXT:    sbbq %r14, %r8
-; AVX512F-NEXT:    andq %r14, %r8
-; AVX512F-NEXT:    andq %rbx, %rcx
-; AVX512F-NEXT:    andq %r10, %rdx
-; AVX512F-NEXT:    andq %rsi, %rbp
-; AVX512F-NEXT:    andq %r12, %r13
-; AVX512F-NEXT:    andq %r15, %r11
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX512F-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Folded Reload
-; AVX512F-NEXT:    movq %rdi, (%rax)
-; AVX512F-NEXT:    movq %r9, 8(%rax)
-; AVX512F-NEXT:    movq %r11, 16(%rax)
-; AVX512F-NEXT:    movq %r13, 24(%rax)
-; AVX512F-NEXT:    movq %rbp, 32(%rax)
-; AVX512F-NEXT:    movq %rdx, 40(%rax)
-; AVX512F-NEXT:    movq %rcx, 48(%rax)
-; AVX512F-NEXT:    movq %r8, 56(%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
-; AVX512F-NEXT:    popq %rbp
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512F-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512F-NEXT:    vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: blsi_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %rbp
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq 56(%rsi), %r11
-; AVX512VL-NEXT:    movq 48(%rsi), %rbx
-; AVX512VL-NEXT:    movq 40(%rsi), %r14
-; AVX512VL-NEXT:    movq 32(%rsi), %r15
-; AVX512VL-NEXT:    movq 24(%rsi), %r9
-; AVX512VL-NEXT:    movq 16(%rsi), %rdx
-; AVX512VL-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT:    movq (%rsi), %rcx
-; AVX512VL-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VL-NEXT:    movq 8(%rsi), %r8
-; AVX512VL-NEXT:    xorl %edi, %edi
-; AVX512VL-NEXT:    movq %rcx, %rsi
-; AVX512VL-NEXT:    negq %rsi
-; AVX512VL-NEXT:    movl $0, %r10d
-; AVX512VL-NEXT:    sbbq %r8, %r10
-; AVX512VL-NEXT:    movl $0, %r12d
-; AVX512VL-NEXT:    sbbq %rdx, %r12
-; AVX512VL-NEXT:    movl $0, %r13d
-; AVX512VL-NEXT:    sbbq %r9, %r13
-; AVX512VL-NEXT:    movl $0, %ebp
-; AVX512VL-NEXT:    sbbq %r15, %rbp
-; AVX512VL-NEXT:    movl $0, %edx
-; AVX512VL-NEXT:    sbbq %r14, %rdx
-; AVX512VL-NEXT:    movl $0, %ecx
-; AVX512VL-NEXT:    sbbq %rbx, %rcx
-; AVX512VL-NEXT:    sbbq %r11, %rdi
-; AVX512VL-NEXT:    andq %r11, %rdi
-; AVX512VL-NEXT:    andq %rbx, %rcx
-; AVX512VL-NEXT:    andq %r14, %rdx
-; AVX512VL-NEXT:    andq %r15, %rbp
-; AVX512VL-NEXT:    andq %r9, %r13
-; AVX512VL-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512VL-NEXT:    andq %r8, %r10
-; AVX512VL-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512VL-NEXT:    movq %rsi, (%rax)
-; AVX512VL-NEXT:    movq %r10, 8(%rax)
-; AVX512VL-NEXT:    movq %r12, 16(%rax)
-; AVX512VL-NEXT:    movq %r13, 24(%rax)
-; AVX512VL-NEXT:    movq %rbp, 32(%rax)
-; AVX512VL-NEXT:    movq %rdx, 40(%rax)
-; AVX512VL-NEXT:    movq %rcx, 48(%rax)
-; AVX512VL-NEXT:    movq %rdi, 56(%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
-; AVX512VL-NEXT:    popq %rbp
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovd %k0, %ecx
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %edx
+; AVX512VL-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VL-NEXT:    xorl %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VL-NEXT:    vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: blsi_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %rbp
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq 56(%rsi), %r11
-; AVX512VBMI-NEXT:    movq 48(%rsi), %rbx
-; AVX512VBMI-NEXT:    movq 40(%rsi), %r14
-; AVX512VBMI-NEXT:    movq 32(%rsi), %r15
-; AVX512VBMI-NEXT:    movq 24(%rsi), %r9
-; AVX512VBMI-NEXT:    movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT:    movq (%rsi), %rcx
-; AVX512VBMI-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512VBMI-NEXT:    movq 8(%rsi), %r8
-; AVX512VBMI-NEXT:    xorl %edi, %edi
-; AVX512VBMI-NEXT:    movq %rcx, %rsi
-; AVX512VBMI-NEXT:    negq %rsi
-; AVX512VBMI-NEXT:    movl $0, %r10d
-; AVX512VBMI-NEXT:    sbbq %r8, %r10
-; AVX512VBMI-NEXT:    movl $0, %r12d
-; AVX512VBMI-NEXT:    sbbq %rdx, %r12
-; AVX512VBMI-NEXT:    movl $0, %r13d
-; AVX512VBMI-NEXT:    sbbq %r9, %r13
-; AVX512VBMI-NEXT:    movl $0, %ebp
-; AVX512VBMI-NEXT:    sbbq %r15, %rbp
-; AVX512VBMI-NEXT:    movl $0, %edx
-; AVX512VBMI-NEXT:    sbbq %r14, %rdx
-; AVX512VBMI-NEXT:    movl $0, %ecx
-; AVX512VBMI-NEXT:    sbbq %rbx, %rcx
-; AVX512VBMI-NEXT:    sbbq %r11, %rdi
-; AVX512VBMI-NEXT:    andq %r11, %rdi
-; AVX512VBMI-NEXT:    andq %rbx, %rcx
-; AVX512VBMI-NEXT:    andq %r14, %rdx
-; AVX512VBMI-NEXT:    andq %r15, %rbp
-; AVX512VBMI-NEXT:    andq %r9, %r13
-; AVX512VBMI-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; AVX512VBMI-NEXT:    andq %r8, %r10
-; AVX512VBMI-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Folded Reload
-; AVX512VBMI-NEXT:    movq %rsi, (%rax)
-; AVX512VBMI-NEXT:    movq %r10, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r12, 16(%rax)
-; AVX512VBMI-NEXT:    movq %r13, 24(%rax)
-; AVX512VBMI-NEXT:    movq %rbp, 32(%rax)
-; AVX512VBMI-NEXT:    movq %rdx, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rcx, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 56(%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
-; AVX512VBMI-NEXT:    popq %rbp
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    vptestmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %ecx
+; AVX512VBMI-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512VBMI-NEXT:    xorl %edx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512VBMI-NEXT:    vpsubq %zmm0, %zmm1, %zmm1
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
+; AVX512VBMI-NEXT:    vpxorq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
   %neg = sub i512 0, %a0
@@ -2621,62 +2505,62 @@ define i512 @blsmsk_i512_load(ptr %p0) nounwind {
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: blsmsk_i512_load:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq 56(%rsi), %rcx
-; AVX512-NEXT:    movq 48(%rsi), %r9
-; AVX512-NEXT:    movq 40(%rsi), %r10
-; AVX512-NEXT:    movq 32(%rsi), %r11
-; AVX512-NEXT:    movq 24(%rsi), %rbx
-; AVX512-NEXT:    movq 16(%rsi), %r15
-; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq (%rsi), %rdx
-; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq 8(%rsi), %r14
-; AVX512-NEXT:    addq $-1, %rdx
-; AVX512-NEXT:    movq %r14, %rsi
-; AVX512-NEXT:    adcq $-1, %rsi
-; AVX512-NEXT:    adcq $-1, %r15
-; AVX512-NEXT:    movq %rbx, %r12
-; AVX512-NEXT:    adcq $-1, %r12
-; AVX512-NEXT:    movq %r11, %r13
-; AVX512-NEXT:    adcq $-1, %r13
-; AVX512-NEXT:    movq %r10, %rbp
-; AVX512-NEXT:    adcq $-1, %rbp
-; AVX512-NEXT:    movq %r9, %r8
-; AVX512-NEXT:    adcq $-1, %r8
-; AVX512-NEXT:    movq %rcx, %rdi
-; AVX512-NEXT:    adcq $-1, %rdi
-; AVX512-NEXT:    xorq %rcx, %rdi
-; AVX512-NEXT:    xorq %r9, %r8
-; AVX512-NEXT:    xorq %r10, %rbp
-; AVX512-NEXT:    xorq %r11, %r13
-; AVX512-NEXT:    xorq %rbx, %r12
-; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; AVX512-NEXT:    xorq %r14, %rsi
-; AVX512-NEXT:    xorq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT:    movq %rdx, (%rax)
-; AVX512-NEXT:    movq %rsi, 8(%rax)
-; AVX512-NEXT:    movq %r15, 16(%rax)
-; AVX512-NEXT:    movq %r12, 24(%rax)
-; AVX512-NEXT:    movq %r13, 32(%rax)
-; AVX512-NEXT:    movq %rbp, 40(%rax)
-; AVX512-NEXT:    movq %r8, 48(%rax)
-; AVX512-NEXT:    movq %rdi, 56(%rax)
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: blsmsk_i512_load:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: blsmsk_i512_load:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %ecx
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %edx
+; AVX512VL-NEXT:    leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT:    xorl %ecx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: blsmsk_i512_load:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %ecx
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rcx,%rdx,2), %edx
+; AVX512VBMI-NEXT:    xorl %ecx, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpxorq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
   %dec = sub i512 %a0, 1
   %res = xor i512 %a0, %dec
@@ -3299,62 +3183,62 @@ define i512 @blsr_i512_load(ptr %p0) nounwind {
 ; AVX2-NEXT:    popq %rbp
 ; AVX2-NEXT:    retq
 ;
-; AVX512-LABEL: blsr_i512_load:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq 56(%rsi), %rcx
-; AVX512-NEXT:    movq 48(%rsi), %r9
-; AVX512-NEXT:    movq 40(%rsi), %r10
-; AVX512-NEXT:    movq 32(%rsi), %r11
-; AVX512-NEXT:    movq 24(%rsi), %rbx
-; AVX512-NEXT:    movq 16(%rsi), %r15
-; AVX512-NEXT:    movq %r15, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq (%rsi), %rdx
-; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq 8(%rsi), %r14
-; AVX512-NEXT:    addq $-1, %rdx
-; AVX512-NEXT:    movq %r14, %rsi
-; AVX512-NEXT:    adcq $-1, %rsi
-; AVX512-NEXT:    adcq $-1, %r15
-; AVX512-NEXT:    movq %rbx, %r12
-; AVX512-NEXT:    adcq $-1, %r12
-; AVX512-NEXT:    movq %r11, %r13
-; AVX512-NEXT:    adcq $-1, %r13
-; AVX512-NEXT:    movq %r10, %rbp
-; AVX512-NEXT:    adcq $-1, %rbp
-; AVX512-NEXT:    movq %r9, %r8
-; AVX512-NEXT:    adcq $-1, %r8
-; AVX512-NEXT:    movq %rcx, %rdi
-; AVX512-NEXT:    adcq $-1, %rdi
-; AVX512-NEXT:    andq %rcx, %rdi
-; AVX512-NEXT:    andq %r9, %r8
-; AVX512-NEXT:    andq %r10, %rbp
-; AVX512-NEXT:    andq %r11, %r13
-; AVX512-NEXT:    andq %rbx, %r12
-; AVX512-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; AVX512-NEXT:    andq %r14, %rsi
-; AVX512-NEXT:    andq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Folded Reload
-; AVX512-NEXT:    movq %rdx, (%rax)
-; AVX512-NEXT:    movq %rsi, 8(%rax)
-; AVX512-NEXT:    movq %r15, 16(%rax)
-; AVX512-NEXT:    movq %r12, 24(%rax)
-; AVX512-NEXT:    movq %r13, 32(%rax)
-; AVX512-NEXT:    movq %rbp, 40(%rax)
-; AVX512-NEXT:    movq %r8, 48(%rax)
-; AVX512-NEXT:    movq %rdi, 56(%rax)
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    retq
+; AVX512F-LABEL: blsr_i512_load:
+; AVX512F:       # %bb.0:
+; AVX512F-NEXT:    movq %rdi, %rax
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512F-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512F-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512F-NEXT:    kmovw %k0, %ecx
+; AVX512F-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512F-NEXT:    kmovw %k0, %edx
+; AVX512F-NEXT:    movzbl %dl, %edx
+; AVX512F-NEXT:    leal (%rdx,%rcx,2), %ecx
+; AVX512F-NEXT:    xorl %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512F-NEXT:    retq
+;
+; AVX512VL-LABEL: blsr_i512_load:
+; AVX512VL:       # %bb.0:
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    movq %rdi, %rax
+; AVX512VL-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VL-NEXT:    kmovb %k0, %ecx
+; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VL-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VL-NEXT:    kmovd %k0, %edx
+; AVX512VL-NEXT:    leal (%rcx,%rdx,2), %edx
+; AVX512VL-NEXT:    xorl %ecx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VL-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
+; AVX512VL-NEXT:    retq
+;
+; AVX512VBMI-LABEL: blsr_i512_load:
+; AVX512VBMI:       # %bb.0:
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    movq %rdi, %rax
+; AVX512VBMI-NEXT:    vptestnmq %zmm0, %zmm0, %k0
+; AVX512VBMI-NEXT:    kmovb %k0, %ecx
+; AVX512VBMI-NEXT:    vpternlogd {{.*#+}} zmm1 = -1
+; AVX512VBMI-NEXT:    vpaddq %zmm1, %zmm0, %zmm1
+; AVX512VBMI-NEXT:    vpcmpltuq %zmm0, %zmm1, %k0
+; AVX512VBMI-NEXT:    kmovd %k0, %edx
+; AVX512VBMI-NEXT:    leal (%rcx,%rdx,2), %edx
+; AVX512VBMI-NEXT:    xorl %ecx, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vmovdqa64 %zmm0, %zmm1 {%k1}
+; AVX512VBMI-NEXT:    vpandq %zmm1, %zmm0, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
+; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
   %dec = sub i512 %a0, 1
   %res = and i512 %a0, %dec
@@ -5065,257 +4949,353 @@ define i512 @isolate_msb_i512_vector(<8 x i64> %v0, i512 %idx) nounwind {
 }
 
 define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
-; SSE-LABEL: isolate_msb_i512_load:
-; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    pushq %rax
-; SSE-NEXT:    movq 32(%rsi), %rcx
-; SSE-NEXT:    movq 48(%rsi), %r8
-; SSE-NEXT:    movq 16(%rsi), %r11
-; SSE-NEXT:    movq 40(%rsi), %r9
-; SSE-NEXT:    movq (%rsi), %r15
-; SSE-NEXT:    movq 8(%rsi), %r14
-; SSE-NEXT:    movq 56(%rsi), %r10
-; SSE-NEXT:    movq 24(%rsi), %rbx
-; SSE-NEXT:    movq %rbx, %rdx
-; SSE-NEXT:    orq %r10, %rdx
-; SSE-NEXT:    movq %r14, %rax
-; SSE-NEXT:    orq %r9, %rax
-; SSE-NEXT:    orq %rdx, %rax
-; SSE-NEXT:    movq %r11, %rsi
-; SSE-NEXT:    orq %r8, %rsi
-; SSE-NEXT:    movq %r15, %rdx
-; SSE-NEXT:    orq %rcx, %rdx
-; SSE-NEXT:    orq %rsi, %rdx
-; SSE-NEXT:    bsrq %r10, %rsi
-; SSE-NEXT:    xorq $63, %rsi
-; SSE-NEXT:    bsrq %r8, %r13
-; SSE-NEXT:    xorq $63, %r13
-; SSE-NEXT:    orq $64, %r13
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovneq %rsi, %r13
-; SSE-NEXT:    bsrq %r9, %rsi
-; SSE-NEXT:    xorq $63, %rsi
-; SSE-NEXT:    bsrq %rcx, %r12
-; SSE-NEXT:    xorq $63, %r12
-; SSE-NEXT:    orq $64, %r12
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovneq %rsi, %r12
-; SSE-NEXT:    orq $128, %r12
-; SSE-NEXT:    movq %r8, %rsi
-; SSE-NEXT:    orq %r10, %rsi
-; SSE-NEXT:    cmovneq %r13, %r12
-; SSE-NEXT:    bsrq %rbx, %rsi
-; SSE-NEXT:    xorq $63, %rsi
-; SSE-NEXT:    bsrq %r11, %r13
-; SSE-NEXT:    xorq $63, %r13
-; SSE-NEXT:    orq $64, %r13
-; SSE-NEXT:    testq %rbx, %rbx
-; SSE-NEXT:    cmovneq %rsi, %r13
-; SSE-NEXT:    bsrq %r14, %rbp
-; SSE-NEXT:    xorq $63, %rbp
-; SSE-NEXT:    bsrq %r15, %rsi
-; SSE-NEXT:    xorq $63, %rsi
-; SSE-NEXT:    orq $64, %rsi
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovneq %rbp, %rsi
-; SSE-NEXT:    orq $128, %rsi
-; SSE-NEXT:    orq %rbx, %r11
-; SSE-NEXT:    cmovneq %r13, %rsi
-; SSE-NEXT:    orq $256, %rsi # imm = 0x100
-; SSE-NEXT:    orq %r10, %r9
-; SSE-NEXT:    orq %r8, %rcx
-; SSE-NEXT:    orq %r9, %rcx
-; SSE-NEXT:    cmovneq %r12, %rsi
-; SSE-NEXT:    xorps %xmm0, %xmm0
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
-; SSE-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movl %esi, %ecx
-; SSE-NEXT:    andl $63, %ecx
-; SSE-NEXT:    shrl $3, %esi
-; SSE-NEXT:    andl $56, %esi
-; SSE-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
-; SSE-NEXT:    movq -72(%rsp,%rsi), %r8
-; SSE-NEXT:    movq -80(%rsp,%rsi), %r11
-; SSE-NEXT:    movq %r11, %r9
-; SSE-NEXT:    shrdq %cl, %r8, %r9
-; SSE-NEXT:    movq -88(%rsp,%rsi), %rbx
-; SSE-NEXT:    movq %rbx, %r10
-; SSE-NEXT:    shrdq %cl, %r11, %r10
-; SSE-NEXT:    movq -96(%rsp,%rsi), %r14
-; SSE-NEXT:    movq %r14, %r11
-; SSE-NEXT:    shrdq %cl, %rbx, %r11
-; SSE-NEXT:    movq -104(%rsp,%rsi), %r15
-; SSE-NEXT:    movq %r15, %rbx
-; SSE-NEXT:    shrdq %cl, %r14, %rbx
-; SSE-NEXT:    movq -112(%rsp,%rsi), %r12
-; SSE-NEXT:    movq %r12, %r14
-; SSE-NEXT:    shrdq %cl, %r15, %r14
-; SSE-NEXT:    movq -120(%rsp,%rsi), %r13
-; SSE-NEXT:    movq %r13, %r15
-; SSE-NEXT:    shrdq %cl, %r12, %r15
-; SSE-NEXT:    movq -128(%rsp,%rsi), %rsi
-; SSE-NEXT:    shrq %cl, %r8
-; SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
-; SSE-NEXT:    shrdq %cl, %r13, %rsi
-; SSE-NEXT:    xorl %ecx, %ecx
-; SSE-NEXT:    orq %rax, %rdx
-; SSE-NEXT:    cmoveq %rcx, %r15
-; SSE-NEXT:    cmoveq %rcx, %r14
-; SSE-NEXT:    cmoveq %rcx, %rbx
-; SSE-NEXT:    cmoveq %rcx, %r11
-; SSE-NEXT:    cmoveq %rcx, %r10
-; SSE-NEXT:    cmoveq %rcx, %r9
-; SSE-NEXT:    cmoveq %rcx, %rsi
-; SSE-NEXT:    movq %rdi, %rax
-; SSE-NEXT:    cmoveq %rcx, %r8
-; SSE-NEXT:    movq %r8, 56(%rdi)
-; SSE-NEXT:    movq %r9, 48(%rdi)
-; SSE-NEXT:    movq %r10, 40(%rdi)
-; SSE-NEXT:    movq %r11, 32(%rdi)
-; SSE-NEXT:    movq %rbx, 24(%rdi)
-; SSE-NEXT:    movq %r14, 16(%rdi)
-; SSE-NEXT:    movq %r15, 8(%rdi)
-; SSE-NEXT:    movq %rsi, (%rdi)
-; SSE-NEXT:    addq $8, %rsp
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
-; SSE-NEXT:    retq
+; SSE2-LABEL: isolate_msb_i512_load:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    pushq %rax
+; SSE2-NEXT:    movq 8(%rsi), %r9
+; SSE2-NEXT:    movq 16(%rsi), %rcx
+; SSE2-NEXT:    movq 24(%rsi), %r8
+; SSE2-NEXT:    movq 48(%rsi), %rdx
+; SSE2-NEXT:    movq 56(%rsi), %r11
+; SSE2-NEXT:    movdqa 32(%rsi), %xmm1
+; SSE2-NEXT:    movdqa 48(%rsi), %xmm2
+; SSE2-NEXT:    movdqa 16(%rsi), %xmm0
+; SSE2-NEXT:    por %xmm2, %xmm0
+; SSE2-NEXT:    movdqa (%rsi), %xmm3
+; SSE2-NEXT:    por %xmm1, %xmm3
+; SSE2-NEXT:    por %xmm0, %xmm3
+; SSE2-NEXT:    pxor %xmm0, %xmm0
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm3
+; SSE2-NEXT:    movmskps %xmm3, %eax
+; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    bsrq %r11, %r10
+; SSE2-NEXT:    xorq $63, %r10
+; SSE2-NEXT:    bsrq %rdx, %rbx
+; SSE2-NEXT:    xorq $63, %rbx
+; SSE2-NEXT:    orq $64, %rbx
+; SSE2-NEXT:    testq %r11, %r11
+; SSE2-NEXT:    cmovneq %r10, %rbx
+; SSE2-NEXT:    movq 40(%rsi), %r14
+; SSE2-NEXT:    bsrq %r14, %r15
+; SSE2-NEXT:    bsrq 32(%rsi), %r10
+; SSE2-NEXT:    xorq $63, %r15
+; SSE2-NEXT:    xorq $63, %r10
+; SSE2-NEXT:    orq $64, %r10
+; SSE2-NEXT:    testq %r14, %r14
+; SSE2-NEXT:    cmovneq %r15, %r10
+; SSE2-NEXT:    orq $128, %r10
+; SSE2-NEXT:    orq %r11, %rdx
+; SSE2-NEXT:    cmovneq %rbx, %r10
+; SSE2-NEXT:    bsrq %r8, %rdx
+; SSE2-NEXT:    xorq $63, %rdx
+; SSE2-NEXT:    bsrq %rcx, %r11
+; SSE2-NEXT:    xorq $63, %r11
+; SSE2-NEXT:    orq $64, %r11
+; SSE2-NEXT:    testq %r8, %r8
+; SSE2-NEXT:    cmovneq %rdx, %r11
+; SSE2-NEXT:    bsrq %r9, %rbx
+; SSE2-NEXT:    xorq $63, %rbx
+; SSE2-NEXT:    bsrq (%rsi), %rdx
+; SSE2-NEXT:    xorq $63, %rdx
+; SSE2-NEXT:    orq $64, %rdx
+; SSE2-NEXT:    testq %r9, %r9
+; SSE2-NEXT:    cmovneq %rbx, %rdx
+; SSE2-NEXT:    orq $128, %rdx
+; SSE2-NEXT:    orq %r8, %rcx
+; SSE2-NEXT:    cmovneq %r11, %rdx
+; SSE2-NEXT:    orq $256, %rdx # imm = 0x100
+; SSE2-NEXT:    por %xmm2, %xmm1
+; SSE2-NEXT:    pcmpeqd %xmm0, %xmm1
+; SSE2-NEXT:    movmskps %xmm1, %ecx
+; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    cmovneq %r10, %rdx
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movdqa %xmm0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movl %edx, %ecx
+; SSE2-NEXT:    andl $63, %ecx
+; SSE2-NEXT:    shrl $3, %edx
+; SSE2-NEXT:    andl $56, %edx
+; SSE2-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE2-NEXT:    movq -72(%rsp,%rdx), %rsi
+; SSE2-NEXT:    movq -80(%rsp,%rdx), %r10
+; SSE2-NEXT:    movq %r10, %r8
+; SSE2-NEXT:    shrdq %cl, %rsi, %r8
+; SSE2-NEXT:    movq -88(%rsp,%rdx), %r11
+; SSE2-NEXT:    movq %r11, %r9
+; SSE2-NEXT:    shrdq %cl, %r10, %r9
+; SSE2-NEXT:    movq -96(%rsp,%rdx), %rbx
+; SSE2-NEXT:    movq %rbx, %r10
+; SSE2-NEXT:    shrdq %cl, %r11, %r10
+; SSE2-NEXT:    movq -104(%rsp,%rdx), %r14
+; SSE2-NEXT:    movq %r14, %r11
+; SSE2-NEXT:    shrdq %cl, %rbx, %r11
+; SSE2-NEXT:    movq -112(%rsp,%rdx), %r15
+; SSE2-NEXT:    movq %r15, %rbx
+; SSE2-NEXT:    shrdq %cl, %r14, %rbx
+; SSE2-NEXT:    movq -120(%rsp,%rdx), %r12
+; SSE2-NEXT:    movq %r12, %r14
+; SSE2-NEXT:    shrdq %cl, %r15, %r14
+; SSE2-NEXT:    movq -128(%rsp,%rdx), %rdx
+; SSE2-NEXT:    shrq %cl, %rsi
+; SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE2-NEXT:    shrdq %cl, %r12, %rdx
+; SSE2-NEXT:    xorl %ecx, %ecx
+; SSE2-NEXT:    testl %eax, %eax
+; SSE2-NEXT:    cmoveq %rcx, %r14
+; SSE2-NEXT:    cmoveq %rcx, %rbx
+; SSE2-NEXT:    cmoveq %rcx, %r11
+; SSE2-NEXT:    cmoveq %rcx, %r10
+; SSE2-NEXT:    cmoveq %rcx, %r9
+; SSE2-NEXT:    cmoveq %rcx, %r8
+; SSE2-NEXT:    cmoveq %rcx, %rdx
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    cmoveq %rcx, %rsi
+; SSE2-NEXT:    movq %rsi, 56(%rdi)
+; SSE2-NEXT:    movq %r8, 48(%rdi)
+; SSE2-NEXT:    movq %r9, 40(%rdi)
+; SSE2-NEXT:    movq %r10, 32(%rdi)
+; SSE2-NEXT:    movq %r11, 24(%rdi)
+; SSE2-NEXT:    movq %rbx, 16(%rdi)
+; SSE2-NEXT:    movq %r14, 8(%rdi)
+; SSE2-NEXT:    movq %rdx, (%rdi)
+; SSE2-NEXT:    addq $8, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    retq
+;
+; SSE42-LABEL: isolate_msb_i512_load:
+; SSE42:       # %bb.0:
+; SSE42-NEXT:    pushq %r15
+; SSE42-NEXT:    pushq %r14
+; SSE42-NEXT:    pushq %rbx
+; SSE42-NEXT:    movq 8(%rsi), %r8
+; SSE42-NEXT:    movq 16(%rsi), %rcx
+; SSE42-NEXT:    movq 24(%rsi), %rdx
+; SSE42-NEXT:    movq 40(%rsi), %r11
+; SSE42-NEXT:    movq 48(%rsi), %rax
+; SSE42-NEXT:    movq 56(%rsi), %r10
+; SSE42-NEXT:    movdqa 32(%rsi), %xmm2
+; SSE42-NEXT:    movdqa 48(%rsi), %xmm0
+; SSE42-NEXT:    movdqa (%rsi), %xmm1
+; SSE42-NEXT:    bsrq %r10, %r9
+; SSE42-NEXT:    xorq $63, %r9
+; SSE42-NEXT:    bsrq %rax, %rbx
+; SSE42-NEXT:    xorq $63, %rbx
+; SSE42-NEXT:    orq $64, %rbx
+; SSE42-NEXT:    testq %r10, %r10
+; SSE42-NEXT:    cmovneq %r9, %rbx
+; SSE42-NEXT:    bsrq %r11, %r14
+; SSE42-NEXT:    xorq $63, %r14
+; SSE42-NEXT:    bsrq 32(%rsi), %r9
+; SSE42-NEXT:    xorq $63, %r9
+; SSE42-NEXT:    orq $64, %r9
+; SSE42-NEXT:    testq %r11, %r11
+; SSE42-NEXT:    cmovneq %r14, %r9
+; SSE42-NEXT:    orq $128, %r9
+; SSE42-NEXT:    orq %r10, %rax
+; SSE42-NEXT:    cmovneq %rbx, %r9
+; SSE42-NEXT:    bsrq %rdx, %rax
+; SSE42-NEXT:    xorq $63, %rax
+; SSE42-NEXT:    bsrq %rcx, %r10
+; SSE42-NEXT:    xorq $63, %r10
+; SSE42-NEXT:    orq $64, %r10
+; SSE42-NEXT:    testq %rdx, %rdx
+; SSE42-NEXT:    cmovneq %rax, %r10
+; SSE42-NEXT:    por %xmm2, %xmm1
+; SSE42-NEXT:    bsrq %r8, %r11
+; SSE42-NEXT:    bsrq (%rsi), %rax
+; SSE42-NEXT:    xorq $63, %r11
+; SSE42-NEXT:    xorq $63, %rax
+; SSE42-NEXT:    orq $64, %rax
+; SSE42-NEXT:    testq %r8, %r8
+; SSE42-NEXT:    cmovneq %r11, %rax
+; SSE42-NEXT:    orq $128, %rax
+; SSE42-NEXT:    orq %rdx, %rcx
+; SSE42-NEXT:    cmovneq %r10, %rax
+; SSE42-NEXT:    orq $256, %rax # imm = 0x100
+; SSE42-NEXT:    por %xmm0, %xmm2
+; SSE42-NEXT:    ptest %xmm2, %xmm2
+; SSE42-NEXT:    cmovneq %r9, %rax
+; SSE42-NEXT:    movdqa 16(%rsi), %xmm2
+; SSE42-NEXT:    xorps %xmm3, %xmm3
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movabsq $-9223372036854775808, %rcx # imm = 0x8000000000000000
+; SSE42-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movaps %xmm3, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movl %eax, %ecx
+; SSE42-NEXT:    andl $63, %ecx
+; SSE42-NEXT:    shrl $3, %eax
+; SSE42-NEXT:    andl $56, %eax
+; SSE42-NEXT:    movq $0, -{{[0-9]+}}(%rsp)
+; SSE42-NEXT:    movq -72(%rsp,%rax), %rdx
+; SSE42-NEXT:    movq -80(%rsp,%rax), %r9
+; SSE42-NEXT:    movq %r9, %rsi
+; SSE42-NEXT:    shrdq %cl, %rdx, %rsi
+; SSE42-NEXT:    movq -88(%rsp,%rax), %r10
+; SSE42-NEXT:    movq %r10, %r8
+; SSE42-NEXT:    shrdq %cl, %r9, %r8
+; SSE42-NEXT:    movq -96(%rsp,%rax), %r11
+; SSE42-NEXT:    movq %r11, %r9
+; SSE42-NEXT:    shrdq %cl, %r10, %r9
+; SSE42-NEXT:    movq -104(%rsp,%rax), %rbx
+; SSE42-NEXT:    movq %rbx, %r10
+; SSE42-NEXT:    shrdq %cl, %r11, %r10
+; SSE42-NEXT:    movq -112(%rsp,%rax), %r14
+; SSE42-NEXT:    movq %r14, %r11
+; SSE42-NEXT:    shrdq %cl, %rbx, %r11
+; SSE42-NEXT:    movq -120(%rsp,%rax), %r15
+; SSE42-NEXT:    movq %r15, %rbx
+; SSE42-NEXT:    shrdq %cl, %r14, %rbx
+; SSE42-NEXT:    movq -128(%rsp,%rax), %r14
+; SSE42-NEXT:    shrq %cl, %rdx
+; SSE42-NEXT:    # kill: def $cl killed $cl killed $ecx
+; SSE42-NEXT:    shrdq %cl, %r15, %r14
+; SSE42-NEXT:    por %xmm0, %xmm2
+; SSE42-NEXT:    por %xmm2, %xmm1
+; SSE42-NEXT:    xorl %ecx, %ecx
+; SSE42-NEXT:    ptest %xmm1, %xmm1
+; SSE42-NEXT:    cmoveq %rcx, %rbx
+; SSE42-NEXT:    cmoveq %rcx, %r11
+; SSE42-NEXT:    cmoveq %rcx, %r10
+; SSE42-NEXT:    cmoveq %rcx, %r9
+; SSE42-NEXT:    cmoveq %rcx, %r8
+; SSE42-NEXT:    cmoveq %rcx, %rsi
+; SSE42-NEXT:    cmoveq %rcx, %r14
+; SSE42-NEXT:    movq %rdi, %rax
+; SSE42-NEXT:    cmoveq %rcx, %rdx
+; SSE42-NEXT:    movq %rdx, 56(%rdi)
+; SSE42-NEXT:    movq %rsi, 48(%rdi)
+; SSE42-NEXT:    movq %r8, 40(%rdi)
+; SSE42-NEXT:    movq %r9, 32(%rdi)
+; SSE42-NEXT:    movq %r10, 24(%rdi)
+; SSE42-NEXT:    movq %r11, 16(%rdi)
+; SSE42-NEXT:    movq %rbx, 8(%rdi)
+; SSE42-NEXT:    movq %r14, (%rdi)
+; SSE42-NEXT:    popq %rbx
+; SSE42-NEXT:    popq %r14
+; SSE42-NEXT:    popq %r15
+; SSE42-NEXT:    retq
 ;
 ; AVX2-LABEL: isolate_msb_i512_load:
 ; AVX2:       # %bb.0:
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 32(%rsi), %rax
-; AVX2-NEXT:    movq 48(%rsi), %rcx
-; AVX2-NEXT:    movq 16(%rsi), %r11
-; AVX2-NEXT:    movq 40(%rsi), %r8
-; AVX2-NEXT:    movq (%rsi), %r9
-; AVX2-NEXT:    movq 8(%rsi), %r14
-; AVX2-NEXT:    movq 56(%rsi), %r10
-; AVX2-NEXT:    movq 24(%rsi), %rbx
-; AVX2-NEXT:    movq %rbx, %rsi
-; AVX2-NEXT:    orq %r10, %rsi
-; AVX2-NEXT:    movq %r14, %rdx
-; AVX2-NEXT:    orq %r8, %rdx
-; AVX2-NEXT:    orq %rsi, %rdx
-; AVX2-NEXT:    movq %r11, %r15
-; AVX2-NEXT:    orq %rcx, %r15
-; AVX2-NEXT:    movq %r9, %rsi
-; AVX2-NEXT:    orq %rax, %rsi
-; AVX2-NEXT:    orq %r15, %rsi
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    lzcntq %r10, %r15
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %rcx, %r12
-; AVX2-NEXT:    addq $64, %r12
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovneq %r15, %r12
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    lzcntq %r8, %r13
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    lzcntq %rax, %r15
-; AVX2-NEXT:    addq $64, %r15
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovneq %r13, %r15
-; AVX2-NEXT:    subq $-128, %r15
-; AVX2-NEXT:    movq %rcx, %r13
-; AVX2-NEXT:    orq %r10, %r13
-; AVX2-NEXT:    cmovneq %r12, %r15
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %rbx, %r12
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    lzcntq %r11, %r13
-; AVX2-NEXT:    addq $64, %r13
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovneq %r12, %r13
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %r14, %r12
-; AVX2-NEXT:    lzcntq %r9, %r9
+; AVX2-NEXT:    vmovdqu 32(%rsi), %ymm1
+; AVX2-NEXT:    movq 8(%rsi), %r8
+; AVX2-NEXT:    movq 16(%rsi), %rax
+; AVX2-NEXT:    movq 24(%rsi), %rcx
+; AVX2-NEXT:    movq 40(%rsi), %rdx
+; AVX2-NEXT:    movq 48(%rsi), %r10
+; AVX2-NEXT:    vpor (%rsi), %ymm1, %ymm0
+; AVX2-NEXT:    movq 56(%rsi), %r11
+; AVX2-NEXT:    lzcntq %r11, %r9
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r10, %rbx
+; AVX2-NEXT:    addq $64, %rbx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovneq %r9, %rbx
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    lzcntq %rdx, %r14
+; AVX2-NEXT:    xorl %r9d, %r9d
+; AVX2-NEXT:    lzcntq 32(%rsi), %r9
 ; AVX2-NEXT:    addq $64, %r9
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovneq %r12, %r9
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovneq %r14, %r9
 ; AVX2-NEXT:    subq $-128, %r9
-; AVX2-NEXT:    orq %rbx, %r11
-; AVX2-NEXT:    cmovneq %r13, %r9
-; AVX2-NEXT:    addq $256, %r9 # imm = 0x100
-; AVX2-NEXT:    orq %r10, %r8
+; AVX2-NEXT:    orq %r11, %r10
+; AVX2-NEXT:    cmovneq %rbx, %r9
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq %rcx, %rdx
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %rax, %r10
+; AVX2-NEXT:    addq $64, %r10
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovneq %rdx, %r10
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    lzcntq %r8, %r11
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq (%rsi), %rdx
+; AVX2-NEXT:    addq $64, %rdx
+; AVX2-NEXT:    testq %r8, %r8
+; AVX2-NEXT:    cmovneq %r11, %rdx
+; AVX2-NEXT:    subq $-128, %rdx
 ; AVX2-NEXT:    orq %rcx, %rax
-; AVX2-NEXT:    orq %r8, %rax
-; AVX2-NEXT:    cmovneq %r15, %r9
-; AVX2-NEXT:    vxorps %xmm0, %xmm0, %xmm0
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovaps {{.*#+}} ymm1 = [0,0,0,9223372036854775808]
-; AVX2-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; AVX2-NEXT:    movl %r9d, %ecx
+; AVX2-NEXT:    cmovneq %r10, %rdx
+; AVX2-NEXT:    addq $256, %rdx # imm = 0x100
+; AVX2-NEXT:    vpor 48(%rsi), %xmm1, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovneq %r9, %rdx
+; AVX2-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX2-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovaps {{.*#+}} ymm2 = [0,0,0,9223372036854775808]
+; AVX2-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    vmovdqu %ymm1, -{{[0-9]+}}(%rsp)
+; AVX2-NEXT:    movl %edx, %ecx
 ; AVX2-NEXT:    andl $63, %ecx
-; AVX2-NEXT:    shrl $3, %r9d
-; AVX2-NEXT:    andl $56, %r9d
-; AVX2-NEXT:    movq -72(%rsp,%r9), %r14
-; AVX2-NEXT:    movq -80(%rsp,%r9), %rax
-; AVX2-NEXT:    movq %rax, %r8
-; AVX2-NEXT:    shrdq %cl, %r14, %r8
-; AVX2-NEXT:    movq -88(%rsp,%r9), %rbx
-; AVX2-NEXT:    movq %rbx, %r10
+; AVX2-NEXT:    shrl $3, %edx
+; AVX2-NEXT:    andl $56, %edx
+; AVX2-NEXT:    movq -72(%rsp,%rdx), %r11
+; AVX2-NEXT:    movq -80(%rsp,%rdx), %rax
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    shrdq %cl, %r11, %rsi
+; AVX2-NEXT:    movq -88(%rsp,%rdx), %r10
+; AVX2-NEXT:    movq %r10, %r8
+; AVX2-NEXT:    shrdq %cl, %rax, %r8
+; AVX2-NEXT:    movq -96(%rsp,%rdx), %rax
+; AVX2-NEXT:    movq %rax, %r9
+; AVX2-NEXT:    shrdq %cl, %r10, %r9
+; AVX2-NEXT:    movq -104(%rsp,%rdx), %r14
+; AVX2-NEXT:    movq %r14, %r10
 ; AVX2-NEXT:    shrdq %cl, %rax, %r10
-; AVX2-NEXT:    movq -96(%rsp,%r9), %rax
-; AVX2-NEXT:    movq %rax, %r11
-; AVX2-NEXT:    shrdq %cl, %rbx, %r11
-; AVX2-NEXT:    movq -104(%rsp,%r9), %r12
-; AVX2-NEXT:    movq %r12, %rbx
-; AVX2-NEXT:    shrdq %cl, %rax, %rbx
-; AVX2-NEXT:    movq -112(%rsp,%r9), %rax
-; AVX2-NEXT:    movq %rax, %r15
-; AVX2-NEXT:    shrdq %cl, %r12, %r15
-; AVX2-NEXT:    movq -128(%rsp,%r9), %r12
-; AVX2-NEXT:    movq -120(%rsp,%r9), %r13
-; AVX2-NEXT:    movq %r13, %r9
-; AVX2-NEXT:    shrdq %cl, %rax, %r9
-; AVX2-NEXT:    shrdq %cl, %r13, %r12
+; AVX2-NEXT:    movq -112(%rsp,%rdx), %r15
+; AVX2-NEXT:    movq %r15, %rbx
+; AVX2-NEXT:    shrdq %cl, %r14, %rbx
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    xorl %edi, %edi
-; AVX2-NEXT:    orq %rdx, %rsi
-; AVX2-NEXT:    shrxq %rcx, %r14, %rcx
-; AVX2-NEXT:    cmoveq %rdi, %r9
-; AVX2-NEXT:    cmoveq %rdi, %r15
-; AVX2-NEXT:    cmoveq %rdi, %rbx
-; AVX2-NEXT:    cmoveq %rdi, %r11
-; AVX2-NEXT:    cmoveq %rdi, %r10
-; AVX2-NEXT:    cmoveq %rdi, %r8
-; AVX2-NEXT:    cmoveq %rdi, %r12
-; AVX2-NEXT:    cmoveq %rdi, %rcx
+; AVX2-NEXT:    movq -128(%rsp,%rdx), %rdi
+; AVX2-NEXT:    movq -120(%rsp,%rdx), %r14
+; AVX2-NEXT:    movq %r14, %rdx
+; AVX2-NEXT:    shrdq %cl, %r15, %rdx
+; AVX2-NEXT:    shrdq %cl, %r14, %rdi
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    shrxq %rcx, %r11, %rcx
+; AVX2-NEXT:    cmoveq %r14, %rdx
+; AVX2-NEXT:    cmoveq %r14, %rbx
+; AVX2-NEXT:    cmoveq %r14, %r10
+; AVX2-NEXT:    cmoveq %r14, %r9
+; AVX2-NEXT:    cmoveq %r14, %r8
+; AVX2-NEXT:    cmoveq %r14, %rsi
+; AVX2-NEXT:    cmoveq %r14, %rdi
+; AVX2-NEXT:    cmoveq %r14, %rcx
 ; AVX2-NEXT:    movq %rcx, 56(%rax)
-; AVX2-NEXT:    movq %r8, 48(%rax)
-; AVX2-NEXT:    movq %r10, 40(%rax)
-; AVX2-NEXT:    movq %r11, 32(%rax)
-; AVX2-NEXT:    movq %rbx, 24(%rax)
-; AVX2-NEXT:    movq %r15, 16(%rax)
-; AVX2-NEXT:    movq %r9, 8(%rax)
-; AVX2-NEXT:    movq %r12, (%rax)
+; AVX2-NEXT:    movq %rsi, 48(%rax)
+; AVX2-NEXT:    movq %r8, 40(%rax)
+; AVX2-NEXT:    movq %r9, 32(%rax)
+; AVX2-NEXT:    movq %r10, 24(%rax)
+; AVX2-NEXT:    movq %rbx, 16(%rax)
+; AVX2-NEXT:    movq %rdx, 8(%rax)
+; AVX2-NEXT:    movq %rdi, (%rax)
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    vzeroupper
@@ -5419,4 +5399,5 @@ define i512 @isolate_msb_i512_load(ptr %p0, i512 %idx) nounwind {
 }
 
 ;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
+; AVX512: {{.*}}
 ; CHECK: {{.*}}
diff --git a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
index a2395229eaace..4695f12707827 100644
--- a/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
+++ b/llvm/test/CodeGen/X86/bitcnt-big-integer.ll
@@ -899,8 +899,6 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
 ;
 ; AVX512POPCNT-LABEL: test_ctpop_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0
-; AVX512POPCNT-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm1
 ; AVX512POPCNT-NEXT:    popcntq {{[0-9]+}}(%rsp), %rax
 ; AVX512POPCNT-NEXT:    popcntq {{[0-9]+}}(%rsp), %r10
 ; AVX512POPCNT-NEXT:    addl %eax, %r10d
@@ -916,16 +914,11 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
 ; AVX512POPCNT-NEXT:    addl %eax, %edx
 ; AVX512POPCNT-NEXT:    addl %ecx, %edx
 ; AVX512POPCNT-NEXT:    addl %r8d, %edx
-; AVX512POPCNT-NEXT:    vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vpmovqb %zmm1, %xmm1
-; AVX512POPCNT-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512POPCNT-NEXT:    vpsadbw %xmm2, %xmm1, %xmm1
-; AVX512POPCNT-NEXT:    vmovd %xmm1, %ecx
-; AVX512POPCNT-NEXT:    vpopcntq %zmm0, %zmm0
+; AVX512POPCNT-NEXT:    vpopcntq {{[0-9]+}}(%rsp), %zmm0
 ; AVX512POPCNT-NEXT:    vpmovqb %zmm0, %xmm0
-; AVX512POPCNT-NEXT:    vpsadbw %xmm2, %xmm0, %xmm0
+; AVX512POPCNT-NEXT:    vpxor %xmm1, %xmm1, %xmm1
+; AVX512POPCNT-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT:    addl %ecx, %eax
 ; AVX512POPCNT-NEXT:    addl %edx, %eax
 ; AVX512POPCNT-NEXT:    retq
 ;
@@ -1002,17 +995,12 @@ define i32 @test_ctpop_i1024(i1024 %a0) nounwind {
 ; AVX512VLPOPCNT-NEXT:    popcntq %rdi, %rdx
 ; AVX512VLPOPCNT-NEXT:    addl %eax, %edx
 ; AVX512VLPOPCNT-NEXT:    addl %ecx, %edx
-; AVX512VLPOPCNT-NEXT:    vpopcntq {{[0-9]+}}(%rsp), %ymm0
-; AVX512VLPOPCNT-NEXT:    vpmovqb %ymm0, %xmm0
 ; AVX512VLPOPCNT-NEXT:    addl %r8d, %edx
+; AVX512VLPOPCNT-NEXT:    vpopcntq {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT:    vpmovqb %zmm0, %xmm0
 ; AVX512VLPOPCNT-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX512VLPOPCNT-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %ecx
-; AVX512VLPOPCNT-NEXT:    vpopcntq {{[0-9]+}}(%rsp), %ymm0
-; AVX512VLPOPCNT-NEXT:    vpmovqb %ymm0, %xmm0
-; AVX512VLPOPCNT-NEXT:    vpsadbw %xmm1, %xmm0, %xmm0
 ; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT:    addl %ecx, %eax
 ; AVX512VLPOPCNT-NEXT:    addl %edx, %eax
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
@@ -1883,117 +1871,100 @@ define i32 @test_ctlz_i512(i512 %a0) nounwind {
 define i32 @load_ctlz_i512(ptr %p0) nounwind {
 ; SSE-LABEL: load_ctlz_i512:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 8(%rdi), %r10
-; SSE-NEXT:    movq 16(%rdi), %r9
-; SSE-NEXT:    movq 32(%rdi), %rcx
-; SSE-NEXT:    movq 40(%rdi), %rdx
-; SSE-NEXT:    movq 48(%rdi), %rsi
-; SSE-NEXT:    movq 56(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %rsi, %r14
-; SSE-NEXT:    xorl $63, %r14d
-; SSE-NEXT:    orl $64, %r14d
+; SSE-NEXT:    movdqa 32(%rdi), %xmm0
+; SSE-NEXT:    movq 16(%rdi), %rcx
+; SSE-NEXT:    movq 24(%rdi), %rdx
+; SSE-NEXT:    movq 40(%rdi), %r8
+; SSE-NEXT:    movq 48(%rdi), %rax
+; SSE-NEXT:    movq 56(%rdi), %r9
+; SSE-NEXT:    bsrq %r9, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rax, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    orl $64, %r10d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %esi, %r10d
+; SSE-NEXT:    bsrq %r8, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    bsrq 32(%rdi), %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
 ; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %r14d
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r9, %rax
+; SSE-NEXT:    cmovnel %r10d, %esi
 ; SSE-NEXT:    bsrq %rdx, %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %rcx, %r11
-; SSE-NEXT:    xorl $63, %r11d
-; SSE-NEXT:    orl $64, %r11d
+; SSE-NEXT:    bsrq %rcx, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
 ; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %r11d
-; SSE-NEXT:    movq 24(%rdi), %rbx
-; SSE-NEXT:    subl $-128, %r11d
-; SSE-NEXT:    movq %rsi, %rax
-; SSE-NEXT:    orq %r8, %rax
-; SSE-NEXT:    cmovnel %r14d, %r11d
-; SSE-NEXT:    bsrq %rbx, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r9, %r14
-; SSE-NEXT:    xorl $63, %r14d
-; SSE-NEXT:    orl $64, %r14d
-; SSE-NEXT:    testq %rbx, %rbx
-; SSE-NEXT:    cmovnel %eax, %r14d
-; SSE-NEXT:    bsrq %r10, %r15
-; SSE-NEXT:    xorl $63, %r15d
+; SSE-NEXT:    cmovnel %eax, %r8d
+; SSE-NEXT:    movq 8(%rdi), %r9
+; SSE-NEXT:    bsrq %r9, %r10
+; SSE-NEXT:    xorl $63, %r10d
 ; SSE-NEXT:    movl $127, %eax
 ; SSE-NEXT:    bsrq (%rdi), %rax
 ; SSE-NEXT:    xorl $63, %eax
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r10d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %rbx, %r9
-; SSE-NEXT:    cmovnel %r14d, %eax
-; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r8, %rdx
-; SSE-NEXT:    orq %rsi, %rcx
 ; SSE-NEXT:    orq %rdx, %rcx
-; SSE-NEXT:    cmovnel %r11d, %eax
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por 48(%rdi), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %esi, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_ctlz_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 8(%rdi), %r10
-; AVX2-NEXT:    movq 16(%rdi), %r9
-; AVX2-NEXT:    movq 32(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %rdx
-; AVX2-NEXT:    movq 48(%rdi), %rsi
-; AVX2-NEXT:    movq 56(%rdi), %r8
-; AVX2-NEXT:    lzcntq %r8, %rax
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    lzcntq %rsi, %rbx
-; AVX2-NEXT:    addl $64, %ebx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    movq 16(%rdi), %rcx
+; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    movq 40(%rdi), %rax
+; AVX2-NEXT:    movq 48(%rdi), %r8
+; AVX2-NEXT:    movq 56(%rdi), %r9
+; AVX2-NEXT:    lzcntq %r9, %rsi
+; AVX2-NEXT:    lzcntq %r8, %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %esi, %r10d
+; AVX2-NEXT:    lzcntq %rax, %r11
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    lzcntq 32(%rdi), %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    lzcntq %rcx, %r11
-; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
 ; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r11d
-; AVX2-NEXT:    subl $-128, %r11d
-; AVX2-NEXT:    movq %rsi, %rax
-; AVX2-NEXT:    orq %r8, %rax
-; AVX2-NEXT:    cmovnel %ebx, %r11d
-; AVX2-NEXT:    movq 24(%rdi), %rbx
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    lzcntq %r9, %r14
-; AVX2-NEXT:    addl $64, %r14d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %eax, %r14d
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    lzcntq %r10, %r15
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %r9, %r10
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq (%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rbx, %r9
-; AVX2-NEXT:    cmovnel %r14d, %eax
-; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %rdx
-; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    orq %rdx, %rcx
-; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vpor 48(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_ctlz_i512:
@@ -2211,445 +2182,359 @@ define i32 @vector_ctlz_i512(<16 x i32> %v0) nounwind {
 define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq %r9, %r11
-; SSE-NEXT:    movq %r8, %r9
-; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq %rdx, %r12
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r15, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    bsrq %r14, %rdx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    bsrq %r11, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq %rax, %r14
+; SSE-NEXT:    xorl $63, %r14d
+; SSE-NEXT:    orl $64, %r14d
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    cmovnel %r10d, %r14d
+; SSE-NEXT:    bsrq %r9, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq %r8, %rbx
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    orl $64, %ebx
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r10d, %ebx
+; SSE-NEXT:    subl $-128, %ebx
+; SSE-NEXT:    movq %rax, %r10
+; SSE-NEXT:    orq %r11, %r10
+; SSE-NEXT:    cmovnel %r14d, %ebx
+; SSE-NEXT:    bsrq %rcx, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq %rdx, %r14
+; SSE-NEXT:    xorl $63, %r14d
+; SSE-NEXT:    orl $64, %r14d
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r10d, %r14d
+; SSE-NEXT:    movl $127, %r10d
+; SSE-NEXT:    bsrq %rdi, %r10
+; SSE-NEXT:    bsrq %rsi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    addl $64, %r10d
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %edi, %r10d
+; SSE-NEXT:    subl $-128, %r10d
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    cmovnel %r14d, %r10d
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; SSE-NEXT:    orq %r11, %r9
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT:    orq %rax, %r8
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    addl $256, %r10d # imm = 0x100
+; SSE-NEXT:    orq %r9, %r8
+; SSE-NEXT:    cmovnel %ebx, %r10d
+; SSE-NEXT:    addl $512, %r10d # imm = 0x200
+; SSE-NEXT:    bsrq %rax, %rdx
 ; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    bsrq {{[0-9]+}}(%rsp), %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %r15, %rdx
-; SSE-NEXT:    orq %r8, %rdx
-; SSE-NEXT:    movq %r8, %r14
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    bsrq %r13, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    bsrq %rbx, %rdx
+; SSE-NEXT:    bsrq %rsi, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    orl $64, %r9d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %edx, %r9d
+; SSE-NEXT:    bsrq %rdi, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    bsrq {{[0-9]+}}(%rsp), %rdx
 ; SSE-NEXT:    xorl $63, %edx
 ; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r13, %r13
-; SSE-NEXT:    cmovnel %ecx, %edx
-; SSE-NEXT:    bsrq %r10, %rcx
-; SSE-NEXT:    xorl $63, %ecx
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    cmovnel %r8d, %edx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    bsrq %r8, %rbp
-; SSE-NEXT:    xorl $63, %ebp
-; SSE-NEXT:    orl $64, %ebp
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %ecx, %ebp
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %rbx, %rcx
-; SSE-NEXT:    orq %r13, %rcx
-; SSE-NEXT:    cmovnel %edx, %ebp
-; SSE-NEXT:    addl $256, %ebp # imm = 0x100
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    orq %r14, %rcx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    orq %r15, %rdx
-; SSE-NEXT:    orq %rcx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT:    bsrq %r14, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT:    bsrq %r15, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    bsrq %r11, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r9, %rdx
-; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %eax, %edx
 ; SSE-NEXT:    subl $-128, %edx
-; SSE-NEXT:    movq %r15, %rax
-; SSE-NEXT:    orq %r14, %rax
-; SSE-NEXT:    cmovnel %ecx, %edx
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE-NEXT:    bsrq %r15, %rax
+; SSE-NEXT:    orq %rax, %rsi
+; SSE-NEXT:    cmovnel %r9d, %edx
+; SSE-NEXT:    bsrq %r8, %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r12, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r15, %r15
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    movl $127, %eax
-; SSE-NEXT:    bsrq %rdi, %rax
-; SSE-NEXT:    bsrq %rsi, %rdi
-; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    bsrq %rdi, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %eax, %esi
+; SSE-NEXT:    bsrq %rcx, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    bsrq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %rsi, %rsi
-; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r9d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r15, %r12
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq %r14, %r11
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT:    orq %r8, %rdi
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r11, %r9
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
 ; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    orq %r13, %r10
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    orq %rbx, %r8
-; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r10, %r8
-; SSE-NEXT:    cmovnel %ebp, %eax
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovel %r10d, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
 ; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ctlz_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq %r9, %r14
-; AVX2-NEXT:    movq %r8, %r11
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r12, %rcx
-; AVX2-NEXT:    xorl %r9d, %r9d
-; AVX2-NEXT:    lzcntq %r8, %r9
-; AVX2-NEXT:    addl $64, %r9d
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ecx, %r9d
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r10, %rsi
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rax, %rcx
-; AVX2-NEXT:    addl $64, %ecx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r15, %rbx
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %r11, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %r15, %r15
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    lzcntq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    lzcntq %r10, %r13
+; AVX2-NEXT:    addl $64, %r14d
 ; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %esi, %ecx
-; AVX2-NEXT:    subl $-128, %ecx
-; AVX2-NEXT:    movq %r8, %rsi
-; AVX2-NEXT:    orq %r12, %rsi
-; AVX2-NEXT:    cmovnel %r9d, %ecx
-; AVX2-NEXT:    xorl %edi, %edi
-; AVX2-NEXT:    lzcntq %rbx, %rdi
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r15, %rsi
-; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmovnel %r13d, %r14d
+; AVX2-NEXT:    subl $-128, %r14d
+; AVX2-NEXT:    orq %r15, %r11
+; AVX2-NEXT:    cmovnel %r12d, %r14d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %rbx, %r10
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %rax, %r15
+; AVX2-NEXT:    addl $64, %r15d
 ; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %edi, %esi
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    lzcntq %r13, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; AVX2-NEXT:    xorl %edi, %edi
-; AVX2-NEXT:    lzcntq %r9, %rdi
-; AVX2-NEXT:    testq %r9, %r9
-; AVX2-NEXT:    cmovnel %edi, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r15, %rdi
-; AVX2-NEXT:    orq %rbx, %rdi
-; AVX2-NEXT:    cmovnel %esi, %ebp
-; AVX2-NEXT:    addl $256, %ebp # imm = 0x100
-; AVX2-NEXT:    movq %r10, %rdi
-; AVX2-NEXT:    orq %r12, %rdi
-; AVX2-NEXT:    movq %rax, %rsi
-; AVX2-NEXT:    orq %r8, %rsi
-; AVX2-NEXT:    orq %rdi, %rsi
-; AVX2-NEXT:    cmovnel %ecx, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdi, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r12, %rcx
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r11, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r14, %rsi
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovnel %esi, %ecx
-; AVX2-NEXT:    subl $-128, %ecx
-; AVX2-NEXT:    movq %rdi, %rsi
-; AVX2-NEXT:    orq %r12, %rsi
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq %rdx, %rdi
-; AVX2-NEXT:    lzcntq %rdx, %rdx
-; AVX2-NEXT:    addl $64, %edx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    cmovnel %r10d, %r15d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %r11, %r12
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %r12d, %r10d
+; AVX2-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    subl $-128, %r10d
+; AVX2-NEXT:    orq %rbx, %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmovnel %r15d, %r10d
+; AVX2-NEXT:    addl $256, %r10d # imm = 0x100
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %r14d, %r10d
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %r10, %rax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %eax, %edx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    lzcntq %rbx, %rax
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %r11, %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %rbx, %rbx
+; AVX2-NEXT:    cmovnel %eax, %r15d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %r9, %rax
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    lzcntq %r8, %r14
+; AVX2-NEXT:    addl $64, %r14d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %eax, %r14d
+; AVX2-NEXT:    subl $-128, %r14d
+; AVX2-NEXT:    movq %r11, %rax
+; AVX2-NEXT:    orq %rbx, %rax
+; AVX2-NEXT:    cmovnel %r15d, %r14d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %rcx, %rax
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %rdx, %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %eax, %r15d
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %rsi, %r12
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %rdi, %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    lzcntq %rsi, %r8
 ; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    cmovnel %r12d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r10, %rdi
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    orq %r12, %r14
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r14, %r11
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r15d, %eax
 ; AVX2-NEXT:    orq %rbx, %r9
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    orq %r15, %r13
+; AVX2-NEXT:    orq %r11, %r8
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r14d, %eax
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r9, %r13
-; AVX2-NEXT:    cmovnel %ebp, %eax
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
 ; AVX2-NEXT:    popq %r13
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: test_ctlz_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT:    vmovq %rdi, %xmm0
-; AVX512F-NEXT:    vmovq %rsi, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovq %rdx, %xmm1
-; AVX512F-NEXT:    vmovq %rcx, %xmm2
+; AVX512F-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512F-NEXT:    vmovq %rdi, %xmm1
+; AVX512F-NEXT:    vmovq %rsi, %xmm2
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vmovq %r8, %xmm1
-; AVX512F-NEXT:    vmovq %r9, %xmm3
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512F-NEXT:    vmovq %rdx, %xmm2
+; AVX512F-NEXT:    vmovq %rcx, %xmm3
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm3 = mem[2,3,0,1]
 ; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovd %xmm0, %ecx
+; AVX512F-NEXT:    vmovq %r8, %xmm2
+; AVX512F-NEXT:    vmovq %r9, %xmm4
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %ecx
 ; AVX512F-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    orq %r14, %r11
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    orq %rbx, %r10
-; AVX512F-NEXT:    orq %r11, %r10
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
+; AVX512F-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
 ; AVX512F-NEXT:    cmovel %ecx, %eax
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: test_ctlz_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    pushq %r14
-; AVX512POPCNT-NEXT:    pushq %rbx
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT:    vmovq %rdi, %xmm0
-; AVX512POPCNT-NEXT:    vmovq %rsi, %xmm1
-; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512POPCNT-NEXT:    vmovq %rdx, %xmm1
-; AVX512POPCNT-NEXT:    vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512POPCNT-NEXT:    vmovq %rdi, %xmm1
+; AVX512POPCNT-NEXT:    vmovq %rsi, %xmm2
 ; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512POPCNT-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
-; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512POPCNT-NEXT:    vmovq %r8, %xmm1
-; AVX512POPCNT-NEXT:    vmovq %r9, %xmm3
-; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm3[0],xmm1[0]
+; AVX512POPCNT-NEXT:    vmovq %rdx, %xmm2
+; AVX512POPCNT-NEXT:    vmovq %rcx, %xmm3
+; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
+; AVX512POPCNT-NEXT:    vpshufd {{.*#+}} xmm3 = mem[2,3,0,1]
 ; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
-; AVX512POPCNT-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT:    vmovq %r8, %xmm2
+; AVX512POPCNT-NEXT:    vmovq %r9, %xmm4
+; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm4[0],xmm2[0]
+; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm2, %ymm3, %ymm2
+; AVX512POPCNT-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %ecx
 ; AVX512POPCNT-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT:    orq %r14, %r11
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT:    orq %rbx, %r10
-; AVX512POPCNT-NEXT:    orq %r11, %r10
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
+; AVX512POPCNT-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
 ; AVX512POPCNT-NEXT:    cmovel %ecx, %eax
-; AVX512POPCNT-NEXT:    popq %rbx
-; AVX512POPCNT-NEXT:    popq %r14
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_ctlz_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT:    vmovq %rdi, %xmm0
-; AVX512VL-NEXT:    vmovq %rsi, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vmovq %rdx, %xmm1
-; AVX512VL-NEXT:    vmovq %rcx, %xmm2
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT:    vmovq %rdi, %xmm1
+; AVX512VL-NEXT:    vmovq %rsi, %xmm2
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VL-NEXT:    vmovq %r8, %xmm2
-; AVX512VL-NEXT:    vmovq %r9, %xmm3
+; AVX512VL-NEXT:    vmovq %rdx, %xmm2
+; AVX512VL-NEXT:    vmovq %rcx, %xmm3
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT:    vmovd %xmm0, %ecx
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VL-NEXT:    vmovq %r8, %xmm3
+; AVX512VL-NEXT:    vmovq %r9, %xmm4
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VL-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT:    orq %r14, %r11
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    orq %rbx, %r10
-; AVX512VL-NEXT:    orq %r11, %r10
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
+; AVX512VL-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
 ; AVX512VL-NEXT:    cmovel %ecx, %eax
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: test_ctlz_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    pushq %r14
-; AVX512VLPOPCNT-NEXT:    pushq %rbx
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT:    vmovq %rdi, %xmm0
-; AVX512VLPOPCNT-NEXT:    vmovq %rsi, %xmm1
-; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VLPOPCNT-NEXT:    vmovq %rdx, %xmm1
-; AVX512VLPOPCNT-NEXT:    vmovq %rcx, %xmm2
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovq %rdi, %xmm1
+; AVX512VLPOPCNT-NEXT:    vmovq %rsi, %xmm2
 ; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VLPOPCNT-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VLPOPCNT-NEXT:    vmovq %r8, %xmm2
-; AVX512VLPOPCNT-NEXT:    vmovq %r9, %xmm3
+; AVX512VLPOPCNT-NEXT:    vmovq %rdx, %xmm2
+; AVX512VLPOPCNT-NEXT:    vmovq %rcx, %xmm3
 ; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VLPOPCNT-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %ecx
+; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VLPOPCNT-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VLPOPCNT-NEXT:    vmovq %r8, %xmm3
+; AVX512VLPOPCNT-NEXT:    vmovq %r9, %xmm4
+; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VLPOPCNT-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VLPOPCNT-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT:    orq %r14, %r11
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT:    orq %rbx, %r10
-; AVX512VLPOPCNT-NEXT:    orq %r11, %r10
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
+; AVX512VLPOPCNT-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
 ; AVX512VLPOPCNT-NEXT:    cmovel %ecx, %eax
-; AVX512VLPOPCNT-NEXT:    popq %rbx
-; AVX512VLPOPCNT-NEXT:    popq %r14
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 0)
@@ -2660,390 +2545,305 @@ define i32 @test_ctlz_i1024(i1024 %a0) nounwind {
 define i32 @load_ctlz_i1024(ptr %p0) nounwind {
 ; SSE-LABEL: load_ctlz_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 40(%rdi), %rbp
-; SSE-NEXT:    movq 64(%rdi), %rbx
-; SSE-NEXT:    movq 72(%rdi), %r11
-; SSE-NEXT:    movq 80(%rdi), %r12
-; SSE-NEXT:    movq 88(%rdi), %r14
-; SSE-NEXT:    movq 96(%rdi), %rsi
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    movq 72(%rdi), %rax
 ; SSE-NEXT:    movq 104(%rdi), %r9
-; SSE-NEXT:    movq 112(%rdi), %r10
+; SSE-NEXT:    movq 112(%rdi), %rdx
 ; SSE-NEXT:    movq 120(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r10, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
+; SSE-NEXT:    bsrq %r8, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rdx, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
 ; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    bsrq %r9, %rdx
-; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    bsrq %rsi, %rax
-; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    cmovnel %esi, %r11d
+; SSE-NEXT:    bsrq %r9, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq 96(%rdi), %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
 ; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %r10, %rdx
-; SSE-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE-NEXT:    cmovnel %r10d, %esi
+; SSE-NEXT:    movq 80(%rdi), %r9
+; SSE-NEXT:    movq 88(%rdi), %r10
+; SSE-NEXT:    subl $-128, %esi
 ; SSE-NEXT:    orq %r8, %rdx
-; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    bsrq %r14, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    bsrq %r12, %rdx
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    bsrq %r10, %rdx
 ; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %ecx, %edx
-; SSE-NEXT:    bsrq %r11, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    bsrq %rbx, %r15
-; SSE-NEXT:    xorl $63, %r15d
-; SSE-NEXT:    orl $64, %r15d
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %ecx, %r15d
-; SSE-NEXT:    subl $-128, %r15d
-; SSE-NEXT:    movq %r12, %rcx
-; SSE-NEXT:    orq %r14, %rcx
-; SSE-NEXT:    cmovnel %edx, %r15d
-; SSE-NEXT:    movq 48(%rdi), %r12
-; SSE-NEXT:    addl $256, %r15d # imm = 0x100
-; SSE-NEXT:    movq %r9, %rcx
-; SSE-NEXT:    orq %r8, %rcx
-; SSE-NEXT:    movq %rsi, %rdx
-; SSE-NEXT:    orq %r10, %rdx
-; SSE-NEXT:    orq %rcx, %rdx
-; SSE-NEXT:    movq 56(%rdi), %r13
-; SSE-NEXT:    cmovnel %eax, %r15d
-; SSE-NEXT:    bsrq %r13, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r12, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r13, %r13
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    movq %rbp, %r10
-; SSE-NEXT:    bsrq %rbp, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq 32(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rbp
-; SSE-NEXT:    xorl $63, %ebp
-; SSE-NEXT:    orl $64, %ebp
+; SSE-NEXT:    bsrq %r9, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
 ; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %r12, %rax
-; SSE-NEXT:    orq %r13, %rax
-; SSE-NEXT:    cmovnel %ecx, %ebp
-; SSE-NEXT:    movq 24(%rdi), %r9
-; SSE-NEXT:    bsrq %r9, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq 16(%rdi), %rsi
-; SSE-NEXT:    bsrq %rsi, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    movl $127, %eax
-; SSE-NEXT:    bsrq (%rdi), %rax
-; SSE-NEXT:    movq 8(%rdi), %rdi
-; SSE-NEXT:    bsrq %rdi, %rdx
+; SSE-NEXT:    cmovnel %edx, %r11d
+; SSE-NEXT:    bsrq %rax, %rbx
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    bsrq 64(%rdi), %rdx
 ; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %rdi, %rdi
-; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    orl $64, %edx
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    movq 40(%rdi), %r8
+; SSE-NEXT:    cmovnel %ebx, %edx
+; SSE-NEXT:    movq 48(%rdi), %rax
+; SSE-NEXT:    movdqa 112(%rdi), %xmm0
+; SSE-NEXT:    movdqa 96(%rdi), %xmm1
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    orq %r10, %r9
+; SSE-NEXT:    cmovnel %r11d, %edx
+; SSE-NEXT:    addl $256, %edx # imm = 0x100
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    movq 56(%rdi), %r10
+; SSE-NEXT:    cmovnel %esi, %edx
+; SSE-NEXT:    bsrq %r10, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rax, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %esi, %r11d
+; SSE-NEXT:    bsrq %r8, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    bsrq 32(%rdi), %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %r9d, %esi
+; SSE-NEXT:    movq 16(%rdi), %r8
+; SSE-NEXT:    movq 24(%rdi), %r9
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r10, %rax
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    bsrq %r9, %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    bsrq %r8, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    orl $64, %r10d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %eax, %r10d
+; SSE-NEXT:    bsrq %rcx, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    movl $127, %eax
+; SSE-NEXT:    bsrq (%rdi), %rax
+; SSE-NEXT:    xorl $63, %eax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r11d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %rsi
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq %r13, %r10
-; SSE-NEXT:    orq %r12, %r8
+; SSE-NEXT:    orq %r9, %r8
+; SSE-NEXT:    cmovnel %r10d, %eax
+; SSE-NEXT:    movdqa 32(%rdi), %xmm2
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r10, %r8
-; SSE-NEXT:    cmovnel %ebp, %eax
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT:    orq %r14, %r11
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT:    orq %rcx, %rbx
+; SSE-NEXT:    por 48(%rdi), %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    por 80(%rdi), %xmm0
+; SSE-NEXT:    por 64(%rdi), %xmm1
 ; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r11, %rbx
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_ctlz_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 48(%rdi), %r9
-; AVX2-NEXT:    movq 56(%rdi), %rbp
-; AVX2-NEXT:    movq 64(%rdi), %r11
-; AVX2-NEXT:    movq 72(%rdi), %r10
-; AVX2-NEXT:    movq 80(%rdi), %r14
-; AVX2-NEXT:    movq 88(%rdi), %rbx
-; AVX2-NEXT:    movq 96(%rdi), %rdx
-; AVX2-NEXT:    movq 104(%rdi), %r8
-; AVX2-NEXT:    movq 112(%rdi), %rsi
-; AVX2-NEXT:    movq 120(%rdi), %r15
-; AVX2-NEXT:    lzcntq %r15, %rax
-; AVX2-NEXT:    lzcntq %rsi, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %r8, %r12
+; AVX2-NEXT:    movq 16(%rdi), %rcx
+; AVX2-NEXT:    movq 72(%rdi), %rsi
+; AVX2-NEXT:    movq 104(%rdi), %rdx
+; AVX2-NEXT:    movq 112(%rdi), %r8
+; AVX2-NEXT:    movq 120(%rdi), %r10
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r8, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    lzcntq %rdx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    lzcntq 96(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    movq 80(%rdi), %r9
+; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    movq 88(%rdi), %r11
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %rsi, %r12
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    orq %r15, %r12
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rbx, %rcx
-; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    lzcntq %r14, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %ecx, %r13d
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r10, %rcx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %r11, %r12
-; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %ecx, %r12d
-; AVX2-NEXT:    subl $-128, %r12d
-; AVX2-NEXT:    movq %r14, %rcx
-; AVX2-NEXT:    orq %rbx, %rcx
-; AVX2-NEXT:    cmovnel %r13d, %r12d
-; AVX2-NEXT:    addl $256, %r12d # imm = 0x100
-; AVX2-NEXT:    movq %r8, %rcx
-; AVX2-NEXT:    orq %r15, %rcx
-; AVX2-NEXT:    orq %rsi, %rdx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r12d
-; AVX2-NEXT:    movq %rbp, %r14
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rbp, %rcx
-; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    orq %r10, %r8
+; AVX2-NEXT:    cmovnel %ebx, %eax
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq %r11, %rdx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r9, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %edx, %ebx
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq 64(%rdi), %rdx
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rsi, %r8
+; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    movq 40(%rdi), %r10
+; AVX2-NEXT:    cmovnel %r8d, %edx
+; AVX2-NEXT:    movq 48(%rdi), %r8
+; AVX2-NEXT:    subl $-128, %edx
+; AVX2-NEXT:    orq %r11, %r9
+; AVX2-NEXT:    movq 56(%rdi), %r9
+; AVX2-NEXT:    cmovnel %ebx, %edx
+; AVX2-NEXT:    vmovdqu 96(%rdi), %ymm0
+; AVX2-NEXT:    addl $256, %edx # imm = 0x100
+; AVX2-NEXT:    vpor 112(%rdi), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %eax, %edx
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq %r9, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %rbp, %rbp
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    movq 32(%rdi), %r13
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    lzcntq %r13, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    movq 40(%rdi), %r8
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    lzcntq %r8, %rdx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %edx, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r9, %rdx
-; AVX2-NEXT:    orq %r14, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ebp
-; AVX2-NEXT:    movq 16(%rdi), %r9
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r9, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    lzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %eax, %r11d
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    lzcntq 32(%rdi), %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq 8(%rdi), %rsi
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    movq 24(%rdi), %r10
+; AVX2-NEXT:    cmovnel %eax, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    lzcntq %r9, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq (%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    lzcntq %rsi, %rdi
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r11d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rdx, %r9
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq %r14, %r8
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT:    orq %r10, %rcx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %r13
-; AVX2-NEXT:    cmovnel %ebp, %eax
-; AVX2-NEXT:    orq %r15, %rbx
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rbx, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rcx, %r11
+; AVX2-NEXT:    vpor 48(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vpor 64(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r10, %r11
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %edx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_ctlz_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq 80(%rdi), %rsi
-; AVX512F-NEXT:    movq 64(%rdi), %rcx
-; AVX512F-NEXT:    movq 72(%rdi), %rdx
-; AVX512F-NEXT:    movq 88(%rdi), %r8
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm2, %ecx
+; AVX512F-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm1, %r9d
-; AVX512F-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    orq 120(%rdi), %r8
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512F-NEXT:    orq 104(%rdi), %rdx
-; AVX512F-NEXT:    orq %r8, %rdx
-; AVX512F-NEXT:    orq 112(%rdi), %rsi
-; AVX512F-NEXT:    orq 96(%rdi), %rcx
-; AVX512F-NEXT:    orq %rsi, %rcx
-; AVX512F-NEXT:    orq %rdx, %rcx
-; AVX512F-NEXT:    cmovnel %r9d, %eax
+; AVX512F-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
+; AVX512F-NEXT:    cmovnel %ecx, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: load_ctlz_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    movq 80(%rdi), %rsi
-; AVX512POPCNT-NEXT:    movq 64(%rdi), %rcx
-; AVX512POPCNT-NEXT:    movq 72(%rdi), %rdx
-; AVX512POPCNT-NEXT:    movq 88(%rdi), %r8
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm2, %ecx
+; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm1, %r9d
-; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT:    orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512POPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT:    orq 104(%rdi), %rdx
-; AVX512POPCNT-NEXT:    orq %r8, %rdx
-; AVX512POPCNT-NEXT:    orq 112(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 96(%rdi), %rcx
-; AVX512POPCNT-NEXT:    orq %rsi, %rcx
-; AVX512POPCNT-NEXT:    orq %rdx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_ctlz_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movq 80(%rdi), %rsi
-; AVX512VL-NEXT:    movq 64(%rdi), %rcx
-; AVX512VL-NEXT:    movq 72(%rdi), %rdx
-; AVX512VL-NEXT:    movq 88(%rdi), %r8
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm2, %ecx
+; AVX512VL-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm1, %r9d
-; AVX512VL-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
 ; AVX512VL-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    orq 120(%rdi), %r8
-; AVX512VL-NEXT:    orq 104(%rdi), %rdx
-; AVX512VL-NEXT:    orq 112(%rdi), %rsi
-; AVX512VL-NEXT:    orq %r8, %rdx
-; AVX512VL-NEXT:    orq 96(%rdi), %rcx
-; AVX512VL-NEXT:    orq %rsi, %rcx
-; AVX512VL-NEXT:    orq %rdx, %rcx
-; AVX512VL-NEXT:    cmovnel %r9d, %eax
+; AVX512VL-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
+; AVX512VL-NEXT:    cmovnel %ecx, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: load_ctlz_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    movq 80(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    movq 64(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    movq 72(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    movq 88(%rdi), %r8
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm2, %ecx
+; AVX512VLPOPCNT-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %r9d
-; AVX512VLPOPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLPOPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    orq 120(%rdi), %r8
-; AVX512VLPOPCNT-NEXT:    orq 104(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    orq 112(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq %r8, %rdx
-; AVX512VLPOPCNT-NEXT:    orq 96(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rsi, %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT:    cmovnel %r9d, %eax
+; AVX512VLPOPCNT-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %a0 = load i1024, ptr %p0
@@ -3683,116 +3483,101 @@ define i32 @test_ctlz_undef_i512(i512 %a0) nounwind {
 define i32 @load_ctlz_undef_i512(ptr %p0) nounwind {
 ; SSE-LABEL: load_ctlz_undef_i512:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 8(%rdi), %r11
-; SSE-NEXT:    movq 16(%rdi), %r9
-; SSE-NEXT:    movq 24(%rdi), %r10
-; SSE-NEXT:    movq 32(%rdi), %rcx
-; SSE-NEXT:    movq 40(%rdi), %rdx
-; SSE-NEXT:    movq 48(%rdi), %rsi
-; SSE-NEXT:    movq 56(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %rsi, %r14
-; SSE-NEXT:    xorl $63, %r14d
-; SSE-NEXT:    orl $64, %r14d
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %r14d
+; SSE-NEXT:    movdqa 32(%rdi), %xmm0
+; SSE-NEXT:    movq 8(%rdi), %rsi
+; SSE-NEXT:    movq 16(%rdi), %rcx
+; SSE-NEXT:    movq 24(%rdi), %rdx
+; SSE-NEXT:    movq 48(%rdi), %rax
+; SSE-NEXT:    movq 56(%rdi), %r9
+; SSE-NEXT:    bsrq %r9, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    bsrq %rax, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    orl $64, %r10d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r8d, %r10d
+; SSE-NEXT:    movq 40(%rdi), %r11
+; SSE-NEXT:    bsrq %r11, %rbx
+; SSE-NEXT:    bsrq 32(%rdi), %r8
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    cmovnel %ebx, %r8d
+; SSE-NEXT:    subl $-128, %r8d
+; SSE-NEXT:    orq %r9, %rax
+; SSE-NEXT:    cmovnel %r10d, %r8d
 ; SSE-NEXT:    bsrq %rdx, %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %rcx, %rbx
-; SSE-NEXT:    xorl $63, %ebx
-; SSE-NEXT:    orl $64, %ebx
+; SSE-NEXT:    bsrq %rcx, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    orl $64, %r9d
 ; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ebx
-; SSE-NEXT:    subl $-128, %ebx
-; SSE-NEXT:    movq %rsi, %rax
-; SSE-NEXT:    orq %r8, %rax
-; SSE-NEXT:    cmovnel %r14d, %ebx
-; SSE-NEXT:    bsrq %r10, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r9, %r14
-; SSE-NEXT:    xorl $63, %r14d
-; SSE-NEXT:    orl $64, %r14d
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %eax, %r14d
-; SSE-NEXT:    bsrq %r11, %r15
-; SSE-NEXT:    xorl $63, %r15d
+; SSE-NEXT:    cmovnel %eax, %r9d
+; SSE-NEXT:    bsrq %rsi, %r10
+; SSE-NEXT:    xorl $63, %r10d
 ; SSE-NEXT:    bsrq (%rdi), %rax
 ; SSE-NEXT:    xorl $63, %eax
 ; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %r10d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r10, %r9
-; SSE-NEXT:    cmovnel %r14d, %eax
-; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r8, %rdx
-; SSE-NEXT:    orq %rsi, %rcx
 ; SSE-NEXT:    orq %rdx, %rcx
-; SSE-NEXT:    cmovnel %ebx, %eax
+; SSE-NEXT:    cmovnel %r9d, %eax
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por 48(%rdi), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %r8d, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_ctlz_undef_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 8(%rdi), %r10
-; AVX2-NEXT:    movq 16(%rdi), %r9
-; AVX2-NEXT:    movq 32(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %rdx
-; AVX2-NEXT:    movq 48(%rdi), %rsi
-; AVX2-NEXT:    movq 56(%rdi), %r8
-; AVX2-NEXT:    lzcntq %r8, %rax
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    lzcntq %rsi, %rbx
-; AVX2-NEXT:    addl $64, %ebx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    movq 16(%rdi), %rcx
+; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    movq 40(%rdi), %rax
+; AVX2-NEXT:    movq 48(%rdi), %r8
+; AVX2-NEXT:    movq 56(%rdi), %r9
+; AVX2-NEXT:    lzcntq %r9, %rsi
+; AVX2-NEXT:    lzcntq %r8, %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %esi, %r10d
+; AVX2-NEXT:    lzcntq %rax, %r11
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    lzcntq 32(%rdi), %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    lzcntq %rcx, %r11
-; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
 ; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r11d
-; AVX2-NEXT:    subl $-128, %r11d
-; AVX2-NEXT:    movq %rsi, %rax
-; AVX2-NEXT:    orq %r8, %rax
-; AVX2-NEXT:    cmovnel %ebx, %r11d
-; AVX2-NEXT:    movq 24(%rdi), %rbx
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    lzcntq %r9, %r14
-; AVX2-NEXT:    addl $64, %r14d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %eax, %r14d
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    lzcntq %r10, %r15
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %r9, %r10
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq (%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rbx, %r9
-; AVX2-NEXT:    cmovnel %r14d, %eax
-; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %rdx
-; AVX2-NEXT:    orq %rsi, %rcx
 ; AVX2-NEXT:    orq %rdx, %rcx
-; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm0
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    vpor 48(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_ctlz_undef_i512:
@@ -4001,443 +3786,354 @@ define i32 @vector_ctlz_undef_i512(<16 x i32> %v0) nounwind {
 define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
 ; SSE-LABEL: test_ctlz_undef_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
 ; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq %r9, %r12
-; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    bsrq %r10, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    bsrq %rax, %rbx
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    orl $64, %ebx
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %r11d, %ebx
+; SSE-NEXT:    bsrq %r9, %r14
+; SSE-NEXT:    xorl $63, %r14d
+; SSE-NEXT:    bsrq %r8, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r14d, %r11d
+; SSE-NEXT:    subl $-128, %r11d
+; SSE-NEXT:    movq %rax, %r14
+; SSE-NEXT:    orq %r10, %r14
+; SSE-NEXT:    cmovnel %ebx, %r11d
+; SSE-NEXT:    bsrq %rcx, %rbx
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    bsrq %rdx, %r14
+; SSE-NEXT:    xorl $63, %r14d
+; SSE-NEXT:    orl $64, %r14d
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %ebx, %r14d
+; SSE-NEXT:    bsrq %rsi, %rbx
+; SSE-NEXT:    xorl $63, %ebx
+; SSE-NEXT:    bsrq %rdi, %rdi
+; SSE-NEXT:    xorl $63, %edi
+; SSE-NEXT:    orl $64, %edi
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %ebx, %edi
+; SSE-NEXT:    subl $-128, %edi
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    cmovnel %r14d, %edi
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE-NEXT:    orq %r10, %r9
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT:    bsrq %r11, %rax
+; SSE-NEXT:    orq %rax, %r8
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    addl $256, %edi # imm = 0x100
+; SSE-NEXT:    orq %r9, %r8
+; SSE-NEXT:    cmovnel %r11d, %edi
+; SSE-NEXT:    addl $512, %edi # imm = 0x200
+; SSE-NEXT:    bsrq %rax, %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    bsrq %rsi, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    orl $64, %r10d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %edx, %r10d
+; SSE-NEXT:    bsrq %rbx, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    bsrq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    xorl $63, %edx
+; SSE-NEXT:    orl $64, %edx
+; SSE-NEXT:    testq %rbx, %rbx
+; SSE-NEXT:    cmovnel %r8d, %edx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    orq %rax, %rsi
+; SSE-NEXT:    cmovnel %r10d, %edx
+; SSE-NEXT:    bsrq %r9, %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %rsi, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    bsrq %rdx, %r10
+; SSE-NEXT:    bsrq %r8, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %eax, %esi
+; SSE-NEXT:    bsrq %rcx, %r10
 ; SSE-NEXT:    xorl $63, %r10d
 ; SSE-NEXT:    bsrq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    xorl $63, %eax
 ; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    testq %rcx, %rcx
 ; SSE-NEXT:    cmovnel %r10d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %rsi, %r9
-; SSE-NEXT:    movq %rsi, %rbx
-; SSE-NEXT:    orq %r11, %r9
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    bsrq %r15, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    bsrq %r13, %rsi
-; SSE-NEXT:    xorl $63, %esi
-; SSE-NEXT:    orl $64, %esi
-; SSE-NEXT:    testq %r15, %r15
-; SSE-NEXT:    cmovnel %ecx, %esi
-; SSE-NEXT:    bsrq %r14, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; SSE-NEXT:    bsrq %r9, %rbp
-; SSE-NEXT:    xorl $63, %ebp
-; SSE-NEXT:    orl $64, %ebp
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %ecx, %ebp
-; SSE-NEXT:    movq %r8, %r10
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %r13, %rcx
-; SSE-NEXT:    orq %r15, %rcx
-; SSE-NEXT:    cmovnel %esi, %ebp
-; SSE-NEXT:    addl $256, %ebp # imm = 0x100
-; SSE-NEXT:    orq %r11, %rdx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE-NEXT:    orq %rbx, %rsi
-; SSE-NEXT:    orq %rdx, %rsi
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    bsrq %rdx, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    bsrq %r12, %rsi
-; SSE-NEXT:    xorl $63, %esi
-; SSE-NEXT:    bsrq %r10, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r12, %r12
-; SSE-NEXT:    cmovnel %esi, %ecx
-; SSE-NEXT:    movq %rdi, %rbx
-; SSE-NEXT:    subl $-128, %ecx
-; SSE-NEXT:    movq %r8, %rsi
-; SSE-NEXT:    orq %rdx, %rsi
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE-NEXT:    bsrq %r11, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE-NEXT:    bsrq %r8, %rdx
-; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %eax, %edx
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; SSE-NEXT:    bsrq %rdi, %rsi
-; SSE-NEXT:    xorl $63, %esi
-; SSE-NEXT:    bsrq %rbx, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    orq %r9, %r8
 ; SSE-NEXT:    cmovnel %esi, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r11, %r8
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r12
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r10
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm1
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r12, %r10
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r15
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT:    orq %r15, %r14
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r9
-; SSE-NEXT:    orq %r13, %r9
-; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r14, %r9
-; SSE-NEXT:    cmovnel %ebp, %eax
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %edx, %eax
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm1
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovel %edi, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
 ; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: test_ctlz_undef_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
 ; AVX2-NEXT:    pushq %r15
 ; AVX2-NEXT:    pushq %r14
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq %r9, %r14
-; AVX2-NEXT:    movq %r8, %r11
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r12, %rcx
-; AVX2-NEXT:    xorl %r9d, %r9d
-; AVX2-NEXT:    lzcntq %r8, %r9
-; AVX2-NEXT:    addl $64, %r9d
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ecx, %r9d
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r10, %rsi
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rax, %rcx
-; AVX2-NEXT:    addl $64, %ecx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r15, %rbx
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %r11, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %r15, %r15
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    lzcntq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    lzcntq %r10, %r13
+; AVX2-NEXT:    addl $64, %r14d
 ; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %esi, %ecx
-; AVX2-NEXT:    subl $-128, %ecx
-; AVX2-NEXT:    movq %r8, %rsi
-; AVX2-NEXT:    orq %r12, %rsi
-; AVX2-NEXT:    cmovnel %r9d, %ecx
-; AVX2-NEXT:    xorl %edi, %edi
-; AVX2-NEXT:    lzcntq %rbx, %rdi
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r15, %rsi
-; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmovnel %r13d, %r14d
+; AVX2-NEXT:    subl $-128, %r14d
+; AVX2-NEXT:    orq %r15, %r11
+; AVX2-NEXT:    cmovnel %r12d, %r14d
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq %rbx, %r10
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %rax, %r15
+; AVX2-NEXT:    addl $64, %r15d
 ; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %edi, %esi
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    lzcntq %r13, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; AVX2-NEXT:    xorl %edi, %edi
-; AVX2-NEXT:    lzcntq %r9, %rdi
+; AVX2-NEXT:    cmovnel %r10d, %r15d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %r11, %r12
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    lzcntq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %r12d, %r10d
+; AVX2-NEXT:    vmovdqu {{[0-9]+}}(%rsp), %ymm0
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    subl $-128, %r10d
+; AVX2-NEXT:    orq %rbx, %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmovnel %r15d, %r10d
+; AVX2-NEXT:    addl $256, %r10d # imm = 0x100
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %r14d, %r10d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %rbx, %rax
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %r11, %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %rbx, %rbx
+; AVX2-NEXT:    cmovnel %eax, %r15d
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %r9, %rax
+; AVX2-NEXT:    xorl %r14d, %r14d
+; AVX2-NEXT:    lzcntq %r8, %r14
+; AVX2-NEXT:    addl $64, %r14d
 ; AVX2-NEXT:    testq %r9, %r9
-; AVX2-NEXT:    cmovnel %edi, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r15, %rdi
-; AVX2-NEXT:    orq %rbx, %rdi
-; AVX2-NEXT:    cmovnel %esi, %ebp
-; AVX2-NEXT:    addl $256, %ebp # imm = 0x100
-; AVX2-NEXT:    movq %r10, %rdi
-; AVX2-NEXT:    orq %r12, %rdi
-; AVX2-NEXT:    movq %rax, %rsi
-; AVX2-NEXT:    orq %r8, %rsi
-; AVX2-NEXT:    orq %rdi, %rsi
-; AVX2-NEXT:    cmovnel %ecx, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX2-NEXT:    cmovnel %eax, %r14d
+; AVX2-NEXT:    subl $-128, %r14d
+; AVX2-NEXT:    movq %r11, %rax
+; AVX2-NEXT:    orq %rbx, %rax
+; AVX2-NEXT:    cmovnel %r15d, %r14d
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdi, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r12, %rcx
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r11, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    xorl %esi, %esi
-; AVX2-NEXT:    lzcntq %r14, %rsi
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovnel %esi, %ecx
-; AVX2-NEXT:    subl $-128, %ecx
-; AVX2-NEXT:    movq %rdi, %rsi
-; AVX2-NEXT:    orq %r12, %rsi
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq %rdx, %rdi
-; AVX2-NEXT:    lzcntq %rdx, %rdx
-; AVX2-NEXT:    addl $64, %edx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    lzcntq %rcx, %rax
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    lzcntq %rdx, %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %rcx, %rcx
+; AVX2-NEXT:    cmovnel %eax, %r15d
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    lzcntq %rsi, %r12
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %r10, %rax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %eax, %edx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX2-NEXT:    lzcntq %rax, %rax
+; AVX2-NEXT:    lzcntq %rdi, %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX2-NEXT:    lzcntq %rsi, %r8
 ; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    cmovnel %r12d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r10, %rdi
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    orq %r12, %r14
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r14, %r11
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r9
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r15d, %eax
 ; AVX2-NEXT:    orq %rbx, %r9
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    orq %r15, %r13
+; AVX2-NEXT:    orq %r11, %r8
+; AVX2-NEXT:    addl $256, %eax # imm = 0x100
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r14d, %eax
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r9, %r13
-; AVX2-NEXT:    cmovnel %ebp, %eax
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
 ; AVX2-NEXT:    popq %r13
 ; AVX2-NEXT:    popq %r14
 ; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: test_ctlz_undef_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT:    vmovq %rdi, %xmm0
-; AVX512F-NEXT:    vmovq %rsi, %xmm1
-; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512F-NEXT:    vmovq %rdx, %xmm1
-; AVX512F-NEXT:    vmovq %rcx, %xmm2
+; AVX512F-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512F-NEXT:    vmovq %rdi, %xmm1
+; AVX512F-NEXT:    vmovq %rsi, %xmm2
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512F-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512F-NEXT:    vmovq %r8, %xmm2
-; AVX512F-NEXT:    vmovq %r9, %xmm3
+; AVX512F-NEXT:    vmovq %rdx, %xmm2
+; AVX512F-NEXT:    vmovq %rcx, %xmm3
 ; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512F-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512F-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm0, %ecx
+; AVX512F-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512F-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512F-NEXT:    vmovq %r8, %xmm3
+; AVX512F-NEXT:    vmovq %r9, %xmm4
+; AVX512F-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512F-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512F-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm1, %ecx
 ; AVX512F-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512F-NEXT:    orq %r14, %r11
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512F-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512F-NEXT:    orq %rbx, %r10
-; AVX512F-NEXT:    orq %r11, %r10
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
+; AVX512F-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
 ; AVX512F-NEXT:    cmovel %ecx, %eax
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r14
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: test_ctlz_undef_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    pushq %r14
-; AVX512POPCNT-NEXT:    pushq %rbx
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT:    vmovq %rdi, %xmm0
-; AVX512POPCNT-NEXT:    vmovq %rsi, %xmm1
-; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512POPCNT-NEXT:    vmovq %rdx, %xmm1
-; AVX512POPCNT-NEXT:    vmovq %rcx, %xmm2
+; AVX512POPCNT-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512POPCNT-NEXT:    vmovq %rdi, %xmm1
+; AVX512POPCNT-NEXT:    vmovq %rsi, %xmm2
 ; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512POPCNT-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512POPCNT-NEXT:    vmovq %r8, %xmm2
-; AVX512POPCNT-NEXT:    vmovq %r9, %xmm3
+; AVX512POPCNT-NEXT:    vmovq %rdx, %xmm2
+; AVX512POPCNT-NEXT:    vmovq %rcx, %xmm3
 ; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512POPCNT-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %ecx
+; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512POPCNT-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512POPCNT-NEXT:    vmovq %r8, %xmm3
+; AVX512POPCNT-NEXT:    vmovq %r9, %xmm4
+; AVX512POPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512POPCNT-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512POPCNT-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %ecx
 ; AVX512POPCNT-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512POPCNT-NEXT:    orq %r14, %r11
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512POPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512POPCNT-NEXT:    orq %rbx, %r10
-; AVX512POPCNT-NEXT:    orq %r11, %r10
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
+; AVX512POPCNT-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
 ; AVX512POPCNT-NEXT:    cmovel %ecx, %eax
-; AVX512POPCNT-NEXT:    popq %rbx
-; AVX512POPCNT-NEXT:    popq %r14
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: test_ctlz_undef_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT:    vmovq %rdi, %xmm0
-; AVX512VL-NEXT:    vmovq %rsi, %xmm1
-; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VL-NEXT:    vmovq %rdx, %xmm1
-; AVX512VL-NEXT:    vmovq %rcx, %xmm2
+; AVX512VL-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VL-NEXT:    vmovq %rdi, %xmm1
+; AVX512VL-NEXT:    vmovq %rsi, %xmm2
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VL-NEXT:    vmovq %r8, %xmm2
-; AVX512VL-NEXT:    vmovq %r9, %xmm3
+; AVX512VL-NEXT:    vmovq %rdx, %xmm2
+; AVX512VL-NEXT:    vmovq %rcx, %xmm3
 ; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VL-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VL-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm0, %ecx
+; AVX512VL-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VL-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VL-NEXT:    vmovq %r8, %xmm3
+; AVX512VL-NEXT:    vmovq %r9, %xmm4
+; AVX512VL-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VL-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VL-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VL-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VL-NEXT:    orq %r14, %r11
-; AVX512VL-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512VL-NEXT:    orq %rbx, %r10
-; AVX512VL-NEXT:    orq %r11, %r10
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
+; AVX512VL-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
 ; AVX512VL-NEXT:    cmovel %ecx, %eax
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r14
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: test_ctlz_undef_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    pushq %r14
-; AVX512VLPOPCNT-NEXT:    pushq %rbx
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT:    vmovq %rdi, %xmm0
-; AVX512VLPOPCNT-NEXT:    vmovq %rsi, %xmm1
-; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm0 = xmm1[0],xmm0[0]
-; AVX512VLPOPCNT-NEXT:    vmovq %rdx, %xmm1
-; AVX512VLPOPCNT-NEXT:    vmovq %rcx, %xmm2
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 {{[0-9]+}}(%rsp), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovq %rdi, %xmm1
+; AVX512VLPOPCNT-NEXT:    vmovq %rsi, %xmm2
 ; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm1 = xmm2[0],xmm1[0]
-; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm0, %ymm1, %ymm0
-; AVX512VLPOPCNT-NEXT:    vpshufd {{.*#+}} xmm1 = mem[2,3,0,1]
-; AVX512VLPOPCNT-NEXT:    vmovq %r8, %xmm2
-; AVX512VLPOPCNT-NEXT:    vmovq %r9, %xmm3
+; AVX512VLPOPCNT-NEXT:    vmovq %rdx, %xmm2
+; AVX512VLPOPCNT-NEXT:    vmovq %rcx, %xmm3
 ; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm2 = xmm3[0],xmm2[0]
-; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm2, %ymm1, %ymm1
-; AVX512VLPOPCNT-NEXT:    vinserti64x4 $1, %ymm0, %zmm1, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm2 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %ecx
+; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm1, %ymm2, %ymm1
+; AVX512VLPOPCNT-NEXT:    vpshufd {{.*#+}} xmm2 = mem[2,3,0,1]
+; AVX512VLPOPCNT-NEXT:    vmovq %r8, %xmm3
+; AVX512VLPOPCNT-NEXT:    vmovq %r9, %xmm4
+; AVX512VLPOPCNT-NEXT:    vpunpcklqdq {{.*#+}} xmm3 = xmm4[0],xmm3[0]
+; AVX512VLPOPCNT-NEXT:    vinserti128 $1, %xmm3, %ymm2, %ymm2
+; AVX512VLPOPCNT-NEXT:    vinserti64x4 $1, %ymm1, %zmm2, %zmm1
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %ecx
 ; AVX512VLPOPCNT-NEXT:    addl $512, %ecx # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT:    vpermq {{[0-9]+}}(%rsp), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r14
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; AVX512VLPOPCNT-NEXT:    orq %r14, %r11
-; AVX512VLPOPCNT-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX512VLPOPCNT-NEXT:    orq %rbx, %r10
-; AVX512VLPOPCNT-NEXT:    orq %r11, %r10
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm1
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
+; AVX512VLPOPCNT-NEXT:    vpor {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
 ; AVX512VLPOPCNT-NEXT:    cmovel %ecx, %eax
-; AVX512VLPOPCNT-NEXT:    popq %rbx
-; AVX512VLPOPCNT-NEXT:    popq %r14
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %cnt = call i1024 @llvm.ctlz.i1024(i1024 %a0, i1 -1)
@@ -4448,386 +4144,298 @@ define i32 @test_ctlz_undef_i1024(i1024 %a0) nounwind {
 define i32 @load_ctlz_undef_i1024(ptr %p0) nounwind {
 ; SSE-LABEL: load_ctlz_undef_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 40(%rdi), %rbp
-; SSE-NEXT:    movq 64(%rdi), %rbx
-; SSE-NEXT:    movq 72(%rdi), %r11
-; SSE-NEXT:    movq 80(%rdi), %r12
-; SSE-NEXT:    movq 88(%rdi), %r14
-; SSE-NEXT:    movq 96(%rdi), %r13
-; SSE-NEXT:    movq 104(%rdi), %r9
-; SSE-NEXT:    movq 112(%rdi), %r10
-; SSE-NEXT:    movq 120(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r10, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    bsrq %r9, %rdx
-; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    bsrq %r13, %rax
-; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    orl $64, %eax
+; SSE-NEXT:    movq 8(%rdi), %rcx
+; SSE-NEXT:    movq 72(%rdi), %rax
+; SSE-NEXT:    movq 104(%rdi), %r8
+; SSE-NEXT:    movq 112(%rdi), %rdx
+; SSE-NEXT:    movq 120(%rdi), %r9
+; SSE-NEXT:    bsrq %r9, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rdx, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
 ; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %r10, %rdx
-; SSE-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    orq %r8, %rdx
-; SSE-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    bsrq %r14, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    movq %r12, %rsi
-; SSE-NEXT:    movq %r12, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    bsrq %r12, %rdx
+; SSE-NEXT:    cmovnel %esi, %r11d
+; SSE-NEXT:    bsrq %r8, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq 96(%rdi), %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    movq 80(%rdi), %r8
+; SSE-NEXT:    cmovnel %r10d, %esi
+; SSE-NEXT:    movq 88(%rdi), %r10
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r9, %rdx
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    bsrq %r10, %rdx
 ; SSE-NEXT:    xorl $63, %edx
-; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %ecx, %edx
-; SSE-NEXT:    bsrq %r11, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    bsrq %rbx, %r15
-; SSE-NEXT:    xorl $63, %r15d
-; SSE-NEXT:    orl $64, %r15d
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %ecx, %r15d
-; SSE-NEXT:    movq 48(%rdi), %r12
-; SSE-NEXT:    subl $-128, %r15d
-; SSE-NEXT:    movq %rsi, %rcx
-; SSE-NEXT:    orq %r14, %rcx
-; SSE-NEXT:    cmovnel %edx, %r15d
-; SSE-NEXT:    addl $256, %r15d # imm = 0x100
-; SSE-NEXT:    movq %r9, %rcx
-; SSE-NEXT:    orq %r8, %rcx
-; SSE-NEXT:    movq %r13, %rdx
-; SSE-NEXT:    orq %r10, %rdx
-; SSE-NEXT:    orq %rcx, %rdx
-; SSE-NEXT:    movq 56(%rdi), %r13
-; SSE-NEXT:    cmovnel %eax, %r15d
-; SSE-NEXT:    bsrq %r13, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    bsrq %r12, %rdx
+; SSE-NEXT:    bsrq %r8, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %edx, %r11d
+; SSE-NEXT:    bsrq %rax, %r9
+; SSE-NEXT:    xorl $63, %r9d
+; SSE-NEXT:    bsrq 64(%rdi), %rdx
 ; SSE-NEXT:    xorl $63, %edx
 ; SSE-NEXT:    orl $64, %edx
-; SSE-NEXT:    testq %r13, %r13
-; SSE-NEXT:    cmovnel %eax, %edx
-; SSE-NEXT:    movq %rbp, %r10
-; SSE-NEXT:    bsrq %rbp, %rax
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %r9d, %edx
+; SSE-NEXT:    movq 40(%rdi), %r9
+; SSE-NEXT:    movq 48(%rdi), %rax
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    orq %r10, %r8
+; SSE-NEXT:    movq 56(%rdi), %r8
+; SSE-NEXT:    movdqa 112(%rdi), %xmm0
+; SSE-NEXT:    cmovnel %r11d, %edx
+; SSE-NEXT:    movdqa 96(%rdi), %xmm1
+; SSE-NEXT:    addl $256, %edx # imm = 0x100
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %edx
+; SSE-NEXT:    bsrq %r8, %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    bsrq %rax, %r11
+; SSE-NEXT:    xorl $63, %r11d
+; SSE-NEXT:    orl $64, %r11d
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %esi, %r11d
+; SSE-NEXT:    bsrq %r9, %r10
+; SSE-NEXT:    xorl $63, %r10d
+; SSE-NEXT:    bsrq 32(%rdi), %rsi
+; SSE-NEXT:    xorl $63, %esi
+; SSE-NEXT:    orl $64, %esi
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    movq 16(%rdi), %r9
+; SSE-NEXT:    cmovnel %r10d, %esi
+; SSE-NEXT:    movq 24(%rdi), %r10
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r8, %rax
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    bsrq %r10, %rax
 ; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq 32(%rdi), %r8
-; SSE-NEXT:    bsrq %r8, %rbp
-; SSE-NEXT:    xorl $63, %ebp
-; SSE-NEXT:    orl $64, %ebp
+; SSE-NEXT:    bsrq %r9, %r8
+; SSE-NEXT:    xorl $63, %r8d
+; SSE-NEXT:    orl $64, %r8d
 ; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %r12, %rax
-; SSE-NEXT:    orq %r13, %rax
-; SSE-NEXT:    cmovnel %edx, %ebp
-; SSE-NEXT:    movq 24(%rdi), %r9
-; SSE-NEXT:    bsrq %r9, %rax
-; SSE-NEXT:    xorl $63, %eax
-; SSE-NEXT:    movq 16(%rdi), %rsi
-; SSE-NEXT:    bsrq %rsi, %rcx
-; SSE-NEXT:    xorl $63, %ecx
-; SSE-NEXT:    orl $64, %ecx
-; SSE-NEXT:    testq %r9, %r9
-; SSE-NEXT:    cmovnel %eax, %ecx
-; SSE-NEXT:    movq 8(%rdi), %rdx
+; SSE-NEXT:    cmovnel %eax, %r8d
+; SSE-NEXT:    bsrq %rcx, %r11
+; SSE-NEXT:    xorl $63, %r11d
 ; SSE-NEXT:    bsrq (%rdi), %rax
-; SSE-NEXT:    bsrq %rdx, %rdi
-; SSE-NEXT:    xorl $63, %edi
 ; SSE-NEXT:    xorl $63, %eax
 ; SSE-NEXT:    orl $64, %eax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %edi, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r11d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %rsi
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq %r13, %r10
-; SSE-NEXT:    orq %r12, %r8
+; SSE-NEXT:    orq %r10, %r9
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    movdqa 32(%rdi), %xmm2
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r10, %r8
-; SSE-NEXT:    cmovnel %ebp, %eax
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT:    orq %r14, %r11
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; SSE-NEXT:    orq %rcx, %rbx
+; SSE-NEXT:    por 48(%rdi), %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    por 80(%rdi), %xmm0
+; SSE-NEXT:    por 64(%rdi), %xmm1
 ; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r11, %rbx
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_ctlz_undef_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 48(%rdi), %r9
-; AVX2-NEXT:    movq 56(%rdi), %rbp
-; AVX2-NEXT:    movq 64(%rdi), %r11
-; AVX2-NEXT:    movq 72(%rdi), %r10
-; AVX2-NEXT:    movq 80(%rdi), %r14
-; AVX2-NEXT:    movq 88(%rdi), %rbx
-; AVX2-NEXT:    movq 96(%rdi), %rdx
-; AVX2-NEXT:    movq 104(%rdi), %r8
-; AVX2-NEXT:    movq 112(%rdi), %rsi
-; AVX2-NEXT:    movq 120(%rdi), %r15
-; AVX2-NEXT:    lzcntq %r15, %rax
-; AVX2-NEXT:    lzcntq %rsi, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %r8, %r12
+; AVX2-NEXT:    movq 16(%rdi), %rcx
+; AVX2-NEXT:    movq 72(%rdi), %rsi
+; AVX2-NEXT:    movq 104(%rdi), %rdx
+; AVX2-NEXT:    movq 112(%rdi), %r8
+; AVX2-NEXT:    movq 120(%rdi), %r10
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r8, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    lzcntq %rdx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    lzcntq 96(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    movq 80(%rdi), %r9
+; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    movq 88(%rdi), %r11
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %rsi, %r12
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    orq %r15, %r12
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rbx, %rcx
-; AVX2-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    lzcntq %r14, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %ecx, %r13d
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r10, %rcx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    lzcntq %r11, %r12
-; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %ecx, %r12d
-; AVX2-NEXT:    subl $-128, %r12d
-; AVX2-NEXT:    movq %r14, %rcx
-; AVX2-NEXT:    orq %rbx, %rcx
-; AVX2-NEXT:    cmovnel %r13d, %r12d
-; AVX2-NEXT:    addl $256, %r12d # imm = 0x100
-; AVX2-NEXT:    movq %r8, %rcx
-; AVX2-NEXT:    orq %r15, %rcx
-; AVX2-NEXT:    orq %rsi, %rdx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r12d
-; AVX2-NEXT:    movq %rbp, %r14
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %rbp, %rcx
-; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    orq %r10, %r8
+; AVX2-NEXT:    cmovnel %ebx, %eax
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq %r11, %rdx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    lzcntq %r9, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %edx, %ebx
+; AVX2-NEXT:    xorl %edx, %edx
+; AVX2-NEXT:    lzcntq 64(%rdi), %rdx
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rsi, %r8
+; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    movq 40(%rdi), %r10
+; AVX2-NEXT:    cmovnel %r8d, %edx
+; AVX2-NEXT:    movq 48(%rdi), %r8
+; AVX2-NEXT:    subl $-128, %edx
+; AVX2-NEXT:    orq %r11, %r9
+; AVX2-NEXT:    movq 56(%rdi), %r9
+; AVX2-NEXT:    cmovnel %ebx, %edx
+; AVX2-NEXT:    vmovdqu 96(%rdi), %ymm0
+; AVX2-NEXT:    addl $256, %edx # imm = 0x100
+; AVX2-NEXT:    vpor 112(%rdi), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %eax, %edx
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq %r9, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %rbp, %rbp
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    movq 32(%rdi), %r13
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    lzcntq %r13, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    movq 40(%rdi), %r8
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    lzcntq %r8, %rdx
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %edx, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r9, %rdx
-; AVX2-NEXT:    orq %r14, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ebp
-; AVX2-NEXT:    movq 16(%rdi), %r9
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    lzcntq %r9, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq 24(%rdi), %rdx
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    lzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %eax, %r11d
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    lzcntq 32(%rdi), %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    lzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq 8(%rdi), %rsi
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    movq 24(%rdi), %r10
+; AVX2-NEXT:    cmovnel %eax, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    lzcntq %r10, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    lzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    lzcntq %r9, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    lzcntq (%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    lzcntq %rsi, %rdi
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r11d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rdx, %r9
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq %r14, %r8
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Folded Reload
+; AVX2-NEXT:    orq %r10, %rcx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 32(%rdi), %xmm1
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %r13
-; AVX2-NEXT:    cmovnel %ebp, %eax
-; AVX2-NEXT:    orq %r15, %rbx
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rbx, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rcx, %r11
+; AVX2-NEXT:    vpor 48(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vpor 64(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r10, %r11
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %edx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_ctlz_undef_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq 80(%rdi), %rsi
-; AVX512F-NEXT:    movq 64(%rdi), %rcx
-; AVX512F-NEXT:    movq 72(%rdi), %rdx
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512F-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512F-NEXT:    movq 88(%rdi), %r8
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512F-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512F-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512F-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm2, %ecx
+; AVX512F-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512F-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512F-NEXT:    vmovd %xmm1, %r9d
-; AVX512F-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm0, %eax
-; AVX512F-NEXT:    orq 120(%rdi), %r8
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512F-NEXT:    orq 104(%rdi), %rdx
-; AVX512F-NEXT:    orq %r8, %rdx
-; AVX512F-NEXT:    orq 112(%rdi), %rsi
-; AVX512F-NEXT:    orq 96(%rdi), %rcx
-; AVX512F-NEXT:    orq %rsi, %rcx
-; AVX512F-NEXT:    orq %rdx, %rcx
-; AVX512F-NEXT:    cmovnel %r9d, %eax
+; AVX512F-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
+; AVX512F-NEXT:    cmovnel %ecx, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: load_ctlz_undef_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    movq 80(%rdi), %rsi
-; AVX512POPCNT-NEXT:    movq 64(%rdi), %rcx
-; AVX512POPCNT-NEXT:    movq 72(%rdi), %rdx
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512POPCNT-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    movq 88(%rdi), %r8
+; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512POPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512POPCNT-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm2, %ecx
+; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512POPCNT-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512POPCNT-NEXT:    vmovd %xmm1, %r9d
-; AVX512POPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
-; AVX512POPCNT-NEXT:    orq 120(%rdi), %r8
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512POPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT:    orq 104(%rdi), %rdx
-; AVX512POPCNT-NEXT:    orq %r8, %rdx
-; AVX512POPCNT-NEXT:    orq 112(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 96(%rdi), %rcx
-; AVX512POPCNT-NEXT:    orq %rsi, %rcx
-; AVX512POPCNT-NEXT:    orq %rdx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %r9d, %eax
+; AVX512POPCNT-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_ctlz_undef_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movq 80(%rdi), %rsi
-; AVX512VL-NEXT:    movq 64(%rdi), %rcx
-; AVX512VL-NEXT:    movq 72(%rdi), %rdx
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VL-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512VL-NEXT:    movq 88(%rdi), %r8
+; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VL-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VL-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm2, %ecx
+; AVX512VL-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VL-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm1, %r9d
-; AVX512VL-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512VL-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VL-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
 ; AVX512VL-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    orq 120(%rdi), %r8
-; AVX512VL-NEXT:    orq 104(%rdi), %rdx
-; AVX512VL-NEXT:    orq 112(%rdi), %rsi
-; AVX512VL-NEXT:    orq %r8, %rdx
-; AVX512VL-NEXT:    orq 96(%rdi), %rcx
-; AVX512VL-NEXT:    orq %rsi, %rcx
-; AVX512VL-NEXT:    orq %rdx, %rcx
-; AVX512VL-NEXT:    cmovnel %r9d, %eax
+; AVX512VL-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
+; AVX512VL-NEXT:    cmovnel %ecx, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: load_ctlz_undef_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    movq 80(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    movq 64(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    movq 72(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm0 = [7,6,5,4,3,2,1,0]
-; AVX512VLPOPCNT-NEXT:    vpermq 64(%rdi), %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    movq 88(%rdi), %r8
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm1 = [7,6,5,4,3,2,1,0]
+; AVX512VLPOPCNT-NEXT:    vpermq %zmm0, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm2, %zmm3
+; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm2, %zmm2, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm2 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm2, %ecx
+; AVX512VLPOPCNT-NEXT:    vpermq (%rdi), %zmm1, %zmm1
 ; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm1, %zmm2
-; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm3 = [0,64,128,192,256,320,384,448]
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
 ; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
 ; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %r9d
-; AVX512VLPOPCNT-NEXT:    vpermq (%rdi), %zmm0, %zmm0
-; AVX512VLPOPCNT-NEXT:    vplzcntq %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm3, %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLPOPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    orq 120(%rdi), %r8
-; AVX512VLPOPCNT-NEXT:    orq 104(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    orq 112(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq %r8, %rdx
-; AVX512VLPOPCNT-NEXT:    orq 96(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rsi, %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT:    cmovnel %r9d, %eax
+; AVX512VLPOPCNT-NEXT:    vpor 96(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %a0 = load i1024, ptr %p0
@@ -5452,109 +5060,92 @@ define i32 @test_cttz_i512(i512 %a0) nounwind {
 define i32 @load_cttz_i512(ptr %p0) nounwind {
 ; SSE-LABEL: load_cttz_i512:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 48(%rdi), %r10
-; SSE-NEXT:    movq 40(%rdi), %r9
-; SSE-NEXT:    movq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %rdx
-; SSE-NEXT:    movq (%rdi), %rcx
-; SSE-NEXT:    movq 8(%rdi), %rsi
-; SSE-NEXT:    rep bsfq %rcx, %rax
-; SSE-NEXT:    rep bsfq %rsi, %rbx
-; SSE-NEXT:    addl $64, %ebx
-; SSE-NEXT:    testq %rcx, %rcx
-; SSE-NEXT:    cmovnel %eax, %ebx
-; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    rep bsfq %r8, %r11
-; SSE-NEXT:    addl $64, %r11d
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %r11d
-; SSE-NEXT:    movq 32(%rdi), %r14
-; SSE-NEXT:    subl $-128, %r11d
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    orq %rsi, %rax
-; SSE-NEXT:    cmovnel %ebx, %r11d
-; SSE-NEXT:    rep bsfq %r14, %rax
-; SSE-NEXT:    rep bsfq %r9, %rbx
-; SSE-NEXT:    addl $64, %ebx
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %eax, %ebx
-; SSE-NEXT:    rep bsfq %r10, %r15
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movq 48(%rdi), %rdx
+; SSE-NEXT:    movq 40(%rdi), %rcx
+; SSE-NEXT:    movq (%rdi), %rax
+; SSE-NEXT:    movq 8(%rdi), %r8
+; SSE-NEXT:    rep bsfq %rax, %rsi
+; SSE-NEXT:    rep bsfq %r8, %r9
+; SSE-NEXT:    addl $64, %r9d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %esi, %r9d
+; SSE-NEXT:    movq 16(%rdi), %r10
+; SSE-NEXT:    rep bsfq %r10, %r11
+; SSE-NEXT:    rep bsfq 24(%rdi), %rsi
+; SSE-NEXT:    addl $64, %esi
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r8, %rax
+; SSE-NEXT:    cmovnel %r9d, %esi
+; SSE-NEXT:    movq 32(%rdi), %r8
+; SSE-NEXT:    rep bsfq %r8, %rax
+; SSE-NEXT:    rep bsfq %rcx, %r9
+; SSE-NEXT:    addl $64, %r9d
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %eax, %r9d
+; SSE-NEXT:    rep bsfq %rdx, %r10
 ; SSE-NEXT:    movl $64, %eax
 ; SSE-NEXT:    rep bsfq 56(%rdi), %rax
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %r10d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %r14
-; SSE-NEXT:    cmovnel %ebx, %eax
+; SSE-NEXT:    orq %rcx, %r8
+; SSE-NEXT:    cmovnel %r9d, %eax
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r8, %rsi
-; SSE-NEXT:    orq %rdx, %rcx
-; SSE-NEXT:    orq %rsi, %rcx
-; SSE-NEXT:    cmovnel %r11d, %eax
+; SSE-NEXT:    por 16(%rdi), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %esi, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
-; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_cttz_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 48(%rdi), %r10
-; AVX2-NEXT:    movq 40(%rdi), %r9
-; AVX2-NEXT:    movq 24(%rdi), %r8
-; AVX2-NEXT:    movq 16(%rdi), %rdx
-; AVX2-NEXT:    movq (%rdi), %rcx
-; AVX2-NEXT:    movq 8(%rdi), %rsi
-; AVX2-NEXT:    tzcntq %rcx, %rax
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %rsi, %rbx
-; AVX2-NEXT:    addl $64, %ebx
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    movq 40(%rdi), %rcx
+; AVX2-NEXT:    movq 32(%rdi), %rdx
+; AVX2-NEXT:    movq 16(%rdi), %rax
+; AVX2-NEXT:    movq (%rdi), %r8
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    tzcntq %r8, %rsi
+; AVX2-NEXT:    tzcntq %r9, %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r8, %r8
+; AVX2-NEXT:    cmovnel %esi, %r10d
+; AVX2-NEXT:    tzcntq %rax, %r11
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    tzcntq 24(%rdi), %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    tzcntq %r8, %r11
-; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
 ; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r11d
-; AVX2-NEXT:    subl $-128, %r11d
-; AVX2-NEXT:    movq %rcx, %rax
-; AVX2-NEXT:    orq %rsi, %rax
-; AVX2-NEXT:    cmovnel %ebx, %r11d
-; AVX2-NEXT:    movq 32(%rdi), %rbx
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    tzcntq %r9, %r14
-; AVX2-NEXT:    addl $64, %r14d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %eax, %r14d
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r10, %r15
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 48(%rdi), %r9
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    tzcntq %r9, %r10
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq 56(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r9, %rbx
-; AVX2-NEXT:    cmovnel %r14d, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %rsi
-; AVX2-NEXT:    orq %rdx, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_cttz_i512:
@@ -5786,97 +5377,91 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
 ; SSE-NEXT:    pushq %r13
 ; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq %r9, %r13
-; SSE-NEXT:    movq %r8, %r14
-; SSE-NEXT:    movq %rcx, %rbx
-; SSE-NEXT:    movq %rdx, %r10
-; SSE-NEXT:    movq %rsi, %r9
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; SSE-NEXT:    movq %r9, %r10
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT:    rep bsfq %rdi, %r11
+; SSE-NEXT:    rep bsfq %rsi, %rbx
+; SSE-NEXT:    addl $64, %ebx
+; SSE-NEXT:    testq %rdi, %rdi
+; SSE-NEXT:    cmovnel %r11d, %ebx
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    rep bsfq %rdi, %rax
-; SSE-NEXT:    rep bsfq %r9, %r15
+; SSE-NEXT:    rep bsfq %rdx, %r12
+; SSE-NEXT:    rep bsfq %rcx, %r15
 ; SSE-NEXT:    addl $64, %r15d
-; SSE-NEXT:    testq %rdi, %rdi
-; SSE-NEXT:    cmovnel %eax, %r15d
-; SSE-NEXT:    rep bsfq %r10, %r12
-; SSE-NEXT:    rep bsfq %rcx, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %r12d, %eax
-; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %r12d, %r15d
+; SSE-NEXT:    subl $-128, %r15d
 ; SSE-NEXT:    movq %rdi, %r12
-; SSE-NEXT:    orq %r9, %r12
-; SSE-NEXT:    cmovnel %r15d, %eax
-; SSE-NEXT:    rep bsfq %r8, %r15
-; SSE-NEXT:    movq %r13, %rcx
-; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    rep bsfq %r13, %r13
+; SSE-NEXT:    orq %rsi, %r12
+; SSE-NEXT:    cmovnel %ebx, %r15d
+; SSE-NEXT:    rep bsfq %r8, %rbx
+; SSE-NEXT:    rep bsfq %r10, %r13
 ; SSE-NEXT:    addl $64, %r13d
 ; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %ebx, %r13d
+; SSE-NEXT:    rep bsfq %r9, %r12
+; SSE-NEXT:    rep bsfq %r11, %rbx
+; SSE-NEXT:    addl $64, %ebx
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r12d, %ebx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT:    subl $-128, %ebx
+; SSE-NEXT:    movq %r8, %rbp
+; SSE-NEXT:    orq %r10, %rbp
+; SSE-NEXT:    cmovnel %r13d, %ebx
+; SSE-NEXT:    addl $256, %ebx # imm = 0x100
+; SSE-NEXT:    movq %rsi, %r13
+; SSE-NEXT:    orq %rcx, %r13
+; SSE-NEXT:    movq %rdi, %rbp
+; SSE-NEXT:    orq %rdx, %rbp
+; SSE-NEXT:    orq %r13, %rbp
+; SSE-NEXT:    cmovnel %r15d, %ebx
+; SSE-NEXT:    rep bsfq %rax, %r15
+; SSE-NEXT:    rep bsfq %r12, %r13
+; SSE-NEXT:    addl $64, %r13d
+; SSE-NEXT:    testq %rax, %rax
 ; SSE-NEXT:    cmovnel %r15d, %r13d
-; SSE-NEXT:    rep bsfq %rdx, %r12
+; SSE-NEXT:    rep bsfq %r14, %rbp
 ; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %r15
 ; SSE-NEXT:    addl $64, %r15d
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %r12d, %r15d
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT:    testq %r14, %r14
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; SSE-NEXT:    cmovnel %ebp, %r15d
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; SSE-NEXT:    subl $-128, %r15d
-; SSE-NEXT:    movq %r8, %rbp
-; SSE-NEXT:    orq %rcx, %rbp
+; SSE-NEXT:    orq %r12, %rax
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r12
 ; SSE-NEXT:    cmovnel %r13d, %r15d
-; SSE-NEXT:    addl $256, %r15d # imm = 0x100
-; SSE-NEXT:    movq %r9, %r13
-; SSE-NEXT:    orq %rbx, %r13
-; SSE-NEXT:    movq %rdi, %rbp
-; SSE-NEXT:    orq %r10, %rbp
-; SSE-NEXT:    orq %r13, %rbp
-; SSE-NEXT:    cmovnel %eax, %r15d
-; SSE-NEXT:    rep bsfq %r11, %r13
-; SSE-NEXT:    rep bsfq %r12, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %r13d, %eax
-; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %r13
+; SSE-NEXT:    rep bsfq %r14, %rax
+; SSE-NEXT:    rep bsfq %r12, %r13
 ; SSE-NEXT:    addl $64, %r13d
-; SSE-NEXT:    rep bsfq %rsi, %rcx
-; SSE-NEXT:    testq %rsi, %rsi
-; SSE-NEXT:    cmovnel %ecx, %r13d
-; SSE-NEXT:    subl $-128, %r13d
-; SSE-NEXT:    movq %r11, %rcx
-; SSE-NEXT:    orq %r12, %rcx
+; SSE-NEXT:    testq %r14, %r14
 ; SSE-NEXT:    cmovnel %eax, %r13d
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE-NEXT:    rep bsfq %rbp, %rcx
-; SSE-NEXT:    addl $64, %ecx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ecx
 ; SSE-NEXT:    movl $64, %eax
 ; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    rep bsfq %r8, %rsi
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    rep bsfq %r9, %rbp
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %ebp, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %rbp, %rdx
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r12
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r12, %r11
+; SSE-NEXT:    orq %r12, %r14
 ; SSE-NEXT:    cmovnel %r13d, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; SSE-NEXT:    orq %rbx, %r9
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    orq %r14, %rdi
-; SSE-NEXT:    orq %r10, %rdi
-; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r9, %rdi
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    addl $256, %eax # imm = 0x100
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
 ; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    orq %r11, %rcx
+; SSE-NEXT:    orq %r10, %rsi
+; SSE-NEXT:    orq %rcx, %rsi
+; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
+; SSE-NEXT:    orq %r8, %rdi
+; SSE-NEXT:    orq %rdx, %rdi
+; SSE-NEXT:    addl $512, %eax # imm = 0x200
+; SSE-NEXT:    orq %rsi, %rdi
+; SSE-NEXT:    cmovnel %ebx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r12
@@ -5894,111 +5479,108 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq %r9, %rbx
-; AVX2-NEXT:    movq %r8, %r14
-; AVX2-NEXT:    movq %rcx, %r11
-; AVX2-NEXT:    movq %rdx, %r10
-; AVX2-NEXT:    movq %rsi, %r9
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    movq %r8, %r9
+; AVX2-NEXT:    movq %rcx, %r8
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    tzcntq %rdi, %rax
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r9, %r15
-; AVX2-NEXT:    addl $64, %r15d
-; AVX2-NEXT:    testq %rdi, %rdi
-; AVX2-NEXT:    cmovnel %eax, %r15d
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %r10, %r12
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r11, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r12d, %eax
-; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %rdi, %r12
-; AVX2-NEXT:    orq %r9, %r12
-; AVX2-NEXT:    cmovnel %r15d, %eax
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r14, %r15
-; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rdi, %rbx
 ; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rbx, %r12
+; AVX2-NEXT:    tzcntq %rsi, %r12
 ; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovnel %r15d, %r12d
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %rcx, %r13
+; AVX2-NEXT:    testq %rdi, %rdi
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rdx, %rbx
 ; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %rdx, %r15
+; AVX2-NEXT:    tzcntq %r8, %r15
 ; AVX2-NEXT:    addl $64, %r15d
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %r13d, %r15d
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovnel %ebx, %r15d
 ; AVX2-NEXT:    subl $-128, %r15d
-; AVX2-NEXT:    movq %r14, %r13
-; AVX2-NEXT:    orq %rbx, %r13
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    orq %rsi, %rbx
 ; AVX2-NEXT:    cmovnel %r12d, %r15d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    addl $256, %r15d # imm = 0x100
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    tzcntq %r10, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    tzcntq %r11, %r13
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rcx, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %r13d, %ebx
+; AVX2-NEXT:    subl $-128, %ebx
 ; AVX2-NEXT:    movq %r9, %r13
-; AVX2-NEXT:    orq %r11, %r13
-; AVX2-NEXT:    movq %rdi, %rbp
-; AVX2-NEXT:    orq %r10, %rbp
-; AVX2-NEXT:    orq %r13, %rbp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    cmovnel %eax, %r15d
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    tzcntq %r12, %rbp
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r13, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ebp, %eax
+; AVX2-NEXT:    orq %r10, %r13
+; AVX2-NEXT:    cmovnel %r12d, %ebx
+; AVX2-NEXT:    addl $256, %ebx # imm = 0x100
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    orq %r8, %r12
+; AVX2-NEXT:    movq %rdi, %r13
+; AVX2-NEXT:    orq %rdx, %r13
+; AVX2-NEXT:    orq %r12, %r13
+; AVX2-NEXT:    cmovnel %r15d, %ebx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq %rax, %r15
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    tzcntq %r12, %r13
+; AVX2-NEXT:    addl $64, %r13d
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r15d, %r13d
 ; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    tzcntq %r8, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %rsi, %rcx
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %ecx, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r12, %rcx
-; AVX2-NEXT:    orq %r13, %rcx
-; AVX2-NEXT:    cmovnel %eax, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %rbx, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    tzcntq %r14, %rbp
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %r14, %r14
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    cmovnel %ebp, %r15d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT:    subl $-128, %r15d
+; AVX2-NEXT:    orq %r12, %rax
+; AVX2-NEXT:    cmovnel %r13d, %r15d
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
+; AVX2-NEXT:    tzcntq %rbp, %rax
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    tzcntq %r14, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %rbp, %rbp
+; AVX2-NEXT:    cmovnel %eax, %r12d
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    tzcntq %r8, %rsi
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %esi, %eax
-; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rbx, %rdx
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    tzcntq %r13, %rcx
+; AVX2-NEXT:    testq %r13, %r13
 ; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %r14, %rbp
+; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r13, %r12
-; AVX2-NEXT:    cmovnel %ebp, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %r11, %r9
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    orq %r14, %rdi
-; AVX2-NEXT:    orq %r10, %rdi
-; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r9, %rdi
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
 ; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    orq %r10, %rsi
+; AVX2-NEXT:    orq %r8, %rsi
+; AVX2-NEXT:    orq %r11, %rdx
+; AVX2-NEXT:    orq %r9, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    addl $512, %eax # imm = 0x200
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    cmovnel %ebx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
@@ -6189,389 +5771,301 @@ define i32 @test_cttz_i1024(i1024 %a0) nounwind {
 define i32 @load_cttz_i1024(ptr %p0) nounwind {
 ; SSE-LABEL: load_cttz_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 88(%rdi), %r10
-; SSE-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq 56(%rdi), %rcx
-; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq 40(%rdi), %rsi
-; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq 24(%rdi), %r9
-; SSE-NEXT:    movq 16(%rdi), %r15
-; SSE-NEXT:    movq (%rdi), %r8
-; SSE-NEXT:    movq 8(%rdi), %r11
-; SSE-NEXT:    rep bsfq %r8, %rax
+; SSE-NEXT:    movq 112(%rdi), %rcx
+; SSE-NEXT:    movq 48(%rdi), %rsi
+; SSE-NEXT:    movq (%rdi), %rdx
+; SSE-NEXT:    movq 8(%rdi), %r8
+; SSE-NEXT:    rep bsfq %rdx, %rax
+; SSE-NEXT:    rep bsfq %r8, %r9
+; SSE-NEXT:    addl $64, %r9d
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %eax, %r9d
+; SSE-NEXT:    movq 16(%rdi), %r10
+; SSE-NEXT:    rep bsfq %r10, %r11
+; SSE-NEXT:    rep bsfq 24(%rdi), %rax
+; SSE-NEXT:    addl $64, %eax
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %r11d, %eax
+; SSE-NEXT:    movq 40(%rdi), %r10
+; SSE-NEXT:    movq 32(%rdi), %r11
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    orq %r8, %rdx
+; SSE-NEXT:    cmovnel %r9d, %eax
 ; SSE-NEXT:    rep bsfq %r11, %rdx
+; SSE-NEXT:    rep bsfq %r10, %rbx
+; SSE-NEXT:    addl $64, %ebx
+; SSE-NEXT:    testq %r11, %r11
+; SSE-NEXT:    cmovnel %edx, %ebx
+; SSE-NEXT:    rep bsfq %rsi, %r9
+; SSE-NEXT:    rep bsfq 56(%rdi), %rdx
 ; SSE-NEXT:    addl $64, %edx
-; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    movq 72(%rdi), %r8
+; SSE-NEXT:    cmovnel %r9d, %edx
+; SSE-NEXT:    movq 64(%rdi), %r9
+; SSE-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    orq %r10, %r11
+; SSE-NEXT:    cmovnel %ebx, %edx
+; SSE-NEXT:    addl $256, %edx # imm = 0x100
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
 ; SSE-NEXT:    cmovnel %eax, %edx
-; SSE-NEXT:    rep bsfq %r15, %rbx
 ; SSE-NEXT:    rep bsfq %r9, %rax
-; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r15, %r15
-; SSE-NEXT:    cmovnel %ebx, %eax
-; SSE-NEXT:    movq 32(%rdi), %rbx
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %r8, %r14
-; SSE-NEXT:    orq %r11, %r14
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    rep bsfq %rbx, %rdx
-; SSE-NEXT:    rep bsfq %rsi, %r12
-; SSE-NEXT:    addl $64, %r12d
+; SSE-NEXT:    rep bsfq %r8, %r11
+; SSE-NEXT:    addl $64, %r11d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %eax, %r11d
+; SSE-NEXT:    movq 80(%rdi), %rax
+; SSE-NEXT:    rep bsfq %rax, %r10
+; SSE-NEXT:    rep bsfq 88(%rdi), %rsi
+; SSE-NEXT:    addl $64, %esi
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %r10d, %esi
+; SSE-NEXT:    movq 104(%rdi), %r10
+; SSE-NEXT:    movq 96(%rdi), %rbx
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r8, %r9
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    rep bsfq %rbx, %rax
+; SSE-NEXT:    rep bsfq %r10, %r8
+; SSE-NEXT:    addl $64, %r8d
 ; SSE-NEXT:    testq %rbx, %rbx
-; SSE-NEXT:    cmovnel %edx, %r12d
-; SSE-NEXT:    movq 48(%rdi), %r13
-; SSE-NEXT:    movq %r13, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    rep bsfq %r13, %rdx
-; SSE-NEXT:    rep bsfq %rcx, %r14
-; SSE-NEXT:    addl $64, %r14d
-; SSE-NEXT:    testq %r13, %r13
-; SSE-NEXT:    cmovnel %edx, %r14d
-; SSE-NEXT:    subl $-128, %r14d
-; SSE-NEXT:    movq %rbx, %rdx
-; SSE-NEXT:    orq %rsi, %rdx
-; SSE-NEXT:    cmovnel %r12d, %r14d
-; SSE-NEXT:    movq 72(%rdi), %r12
-; SSE-NEXT:    addl $256, %r14d # imm = 0x100
-; SSE-NEXT:    movq %r11, %rdx
-; SSE-NEXT:    orq %r9, %rdx
-; SSE-NEXT:    movq %r8, %r13
-; SSE-NEXT:    orq %r15, %r13
-; SSE-NEXT:    orq %rdx, %r13
-; SSE-NEXT:    movq 64(%rdi), %r13
-; SSE-NEXT:    cmovnel %eax, %r14d
-; SSE-NEXT:    rep bsfq %r13, %rdx
-; SSE-NEXT:    rep bsfq %r12, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r13, %r13
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    rep bsfq %r10, %rbp
-; SSE-NEXT:    addl $64, %ebp
-; SSE-NEXT:    movq 80(%rdi), %r10
-; SSE-NEXT:    rep bsfq %r10, %rcx
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %ecx, %ebp
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %r13, %rcx
-; SSE-NEXT:    orq %r12, %rcx
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    movq 104(%rdi), %r9
-; SSE-NEXT:    rep bsfq %r9, %rcx
-; SSE-NEXT:    addl $64, %ecx
-; SSE-NEXT:    movq 96(%rdi), %rdx
-; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ecx
+; SSE-NEXT:    cmovnel %eax, %r8d
+; SSE-NEXT:    rep bsfq %rcx, %r9
 ; SSE-NEXT:    movl $64, %eax
 ; SSE-NEXT:    rep bsfq 120(%rdi), %rax
-; SSE-NEXT:    movq 112(%rdi), %rdi
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    rep bsfq %rdi, %rsi
-; SSE-NEXT:    testq %rdi, %rdi
-; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    testq %rcx, %rcx
+; SSE-NEXT:    cmovnel %r9d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %rdx
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Folded Reload
-; SSE-NEXT:    orq %r10, %r13
+; SSE-NEXT:    orq %r10, %rbx
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    movdqa 64(%rdi), %xmm2
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r12, %r13
-; SSE-NEXT:    cmovnel %ebp, %eax
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT:    orq %rcx, %r11
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Folded Reload
-; SSE-NEXT:    orq %rbx, %r8
-; SSE-NEXT:    orq %r15, %r8
+; SSE-NEXT:    por 80(%rdi), %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    por 48(%rdi), %xmm0
+; SSE-NEXT:    por 32(%rdi), %xmm1
 ; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r11, %r8
-; SSE-NEXT:    cmovnel %r14d, %eax
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_cttz_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 72(%rdi), %r14
-; AVX2-NEXT:    movq 64(%rdi), %r15
-; AVX2-NEXT:    movq 56(%rdi), %r9
-; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq 48(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %r10
-; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq 32(%rdi), %rsi
-; AVX2-NEXT:    movq 24(%rdi), %rbp
-; AVX2-NEXT:    movq 16(%rdi), %rbx
+; AVX2-NEXT:    movq 104(%rdi), %rcx
+; AVX2-NEXT:    movq 48(%rdi), %rsi
+; AVX2-NEXT:    movq 16(%rdi), %rdx
 ; AVX2-NEXT:    movq (%rdi), %r8
-; AVX2-NEXT:    movq 8(%rdi), %r11
+; AVX2-NEXT:    movq 8(%rdi), %r9
 ; AVX2-NEXT:    tzcntq %r8, %rax
-; AVX2-NEXT:    tzcntq %r11, %rdx
-; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    addl $64, %ebx
 ; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %eax, %edx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rbx, %r12
+; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    tzcntq %rdx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rbp, %rax
-; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    tzcntq 24(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    movq 40(%rdi), %r10
+; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    movq 32(%rdi), %r11
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %r8, %r12
-; AVX2-NEXT:    orq %r11, %r12
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %rsi, %rdx
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %r10, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    cmovnel %edx, %r13d
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %ebx, %eax
 ; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %rcx, %rdx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %r9, %r12
-; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %edx, %r12d
-; AVX2-NEXT:    subl $-128, %r12d
-; AVX2-NEXT:    movq %rsi, %rdx
-; AVX2-NEXT:    orq %r10, %rdx
-; AVX2-NEXT:    cmovnel %r13d, %r12d
-; AVX2-NEXT:    addl $256, %r12d # imm = 0x100
-; AVX2-NEXT:    movq %r11, %rdx
-; AVX2-NEXT:    orq %rbp, %rdx
-; AVX2-NEXT:    movq %r8, %r13
-; AVX2-NEXT:    orq %rbx, %r13
-; AVX2-NEXT:    orq %rdx, %r13
-; AVX2-NEXT:    cmovnel %eax, %r12d
+; AVX2-NEXT:    tzcntq %r11, %rdx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r10, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %edx, %ebx
 ; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %r15, %rdx
+; AVX2-NEXT:    tzcntq 56(%rdi), %rdx
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rsi, %r8
+; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    movq 80(%rdi), %r9
+; AVX2-NEXT:    cmovnel %r8d, %edx
+; AVX2-NEXT:    movq 72(%rdi), %r8
+; AVX2-NEXT:    subl $-128, %edx
+; AVX2-NEXT:    orq %r10, %r11
+; AVX2-NEXT:    movq 64(%rdi), %r10
+; AVX2-NEXT:    cmovnel %ebx, %edx
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    addl $256, %edx # imm = 0x100
+; AVX2-NEXT:    vpor 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %eax, %edx
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r14, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    movq 88(%rdi), %rbp
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %rbp, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    movq 80(%rdi), %r10
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %r10, %rcx
+; AVX2-NEXT:    tzcntq %r10, %rax
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    tzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
 ; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %ecx, %r13d
-; AVX2-NEXT:    subl $-128, %r13d
-; AVX2-NEXT:    movq %r15, %rcx
-; AVX2-NEXT:    orq %r14, %rcx
-; AVX2-NEXT:    cmovnel %eax, %r13d
-; AVX2-NEXT:    movq 104(%rdi), %r9
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %r9, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq 96(%rdi), %rdx
+; AVX2-NEXT:    cmovnel %eax, %r11d
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    tzcntq 88(%rdi), %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq 112(%rdi), %rsi
+; AVX2-NEXT:    tzcntq %r9, %rax
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    movq 96(%rdi), %r9
+; AVX2-NEXT:    cmovnel %eax, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r8, %r10
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    tzcntq %r9, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 112(%rdi), %r10
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    tzcntq %r10, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq 120(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    tzcntq %rsi, %rdi
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %r11d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r9, %rdx
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq %rbp, %r14
-; AVX2-NEXT:    orq %r10, %r15
+; AVX2-NEXT:    orq %rcx, %r9
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r14, %r15
-; AVX2-NEXT:    cmovnel %r13d, %eax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rcx, %r11
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rbx, %r8
+; AVX2-NEXT:    vpor 80(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r11, %r8
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %edx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_cttz_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512F-NEXT:    movq 16(%rdi), %rax
-; AVX512F-NEXT:    movq (%rdi), %rcx
-; AVX512F-NEXT:    movq 8(%rdi), %rdx
-; AVX512F-NEXT:    movq 24(%rdi), %rsi
-; AVX512F-NEXT:    orq 56(%rdi), %rsi
-; AVX512F-NEXT:    orq 40(%rdi), %rdx
-; AVX512F-NEXT:    orq 48(%rdi), %rax
-; AVX512F-NEXT:    orq 32(%rdi), %rcx
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    orq %rax, %rcx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512F-NEXT:    vplzcntq %zmm3, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
 ; AVX512F-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm1, %esi
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
-; AVX512F-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm3, %ecx
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT:    vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT:    vpsubq %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512F-NEXT:    orq %rdx, %rcx
-; AVX512F-NEXT:    cmovnel %esi, %eax
+; AVX512F-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
+; AVX512F-NEXT:    cmovnel %ecx, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: load_cttz_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512POPCNT-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512POPCNT-NEXT:    movq 16(%rdi), %rax
-; AVX512POPCNT-NEXT:    movq (%rdi), %rcx
-; AVX512POPCNT-NEXT:    movq 8(%rdi), %rdx
-; AVX512POPCNT-NEXT:    movq 24(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 56(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 40(%rdi), %rdx
-; AVX512POPCNT-NEXT:    orq 48(%rdi), %rax
-; AVX512POPCNT-NEXT:    orq 32(%rdi), %rcx
-; AVX512POPCNT-NEXT:    orq %rsi, %rdx
-; AVX512POPCNT-NEXT:    orq %rax, %rcx
+; AVX512POPCNT-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512POPCNT-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512POPCNT-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512POPCNT-NEXT:    vpopcntq %zmm3, %zmm3
 ; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
 ; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
-; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm1, %esi
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm1, %zmm1
 ; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm3, %ecx
+; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpopcntq %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512POPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT:    orq %rdx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %esi, %eax
+; AVX512POPCNT-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cttz_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512VL-NEXT:    movq 16(%rdi), %rax
-; AVX512VL-NEXT:    movq (%rdi), %rcx
-; AVX512VL-NEXT:    movq 8(%rdi), %rdx
-; AVX512VL-NEXT:    movq 24(%rdi), %rsi
-; AVX512VL-NEXT:    orq 56(%rdi), %rsi
-; AVX512VL-NEXT:    orq 40(%rdi), %rdx
-; AVX512VL-NEXT:    orq %rsi, %rdx
-; AVX512VL-NEXT:    orq 48(%rdi), %rax
-; AVX512VL-NEXT:    orq 32(%rdi), %rcx
-; AVX512VL-NEXT:    orq %rax, %rcx
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vplzcntq %zmm3, %zmm3
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
 ; AVX512VL-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm1, %esi
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VL-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm1
-; AVX512VL-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
 ; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm3, %ecx
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsubq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
 ; AVX512VL-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    orq %rdx, %rcx
-; AVX512VL-NEXT:    cmovnel %esi, %eax
+; AVX512VL-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
+; AVX512VL-NEXT:    cmovnel %ecx, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: load_cttz_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512VLPOPCNT-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512VLPOPCNT-NEXT:    movq 16(%rdi), %rax
-; AVX512VLPOPCNT-NEXT:    movq (%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    movq 8(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    movq 24(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq 56(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq 40(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    orq %rsi, %rdx
-; AVX512VLPOPCNT-NEXT:    orq 48(%rdi), %rax
-; AVX512VLPOPCNT-NEXT:    orq 32(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rax, %rcx
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512VLPOPCNT-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VLPOPCNT-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VLPOPCNT-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm3, %zmm3
 ; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
 ; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %esi
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm1, %zmm1
 ; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm0 = [512,512,512,512,512,512,512,512]
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm3, %ecx
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpbroadcastq {{.*#+}} zmm1 = [512,512,512,512,512,512,512,512]
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLPOPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT:    cmovnel %esi, %eax
+; AVX512VLPOPCNT-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %a0 = load i1024, ptr %p0
@@ -7177,108 +6671,93 @@ define i32 @test_cttz_undef_i512(i512 %a0) nounwind {
 define i32 @load_cttz_undef_i512(ptr %p0) nounwind {
 ; SSE-LABEL: load_cttz_undef_i512:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 40(%rdi), %r9
-; SSE-NEXT:    movq 24(%rdi), %r8
-; SSE-NEXT:    movq 16(%rdi), %rdx
-; SSE-NEXT:    movq (%rdi), %rcx
-; SSE-NEXT:    movq 8(%rdi), %rsi
-; SSE-NEXT:    rep bsfq %rcx, %rax
-; SSE-NEXT:    rep bsfq %rsi, %r11
+; SSE-NEXT:    movdqa (%rdi), %xmm0
+; SSE-NEXT:    movq 48(%rdi), %rsi
+; SSE-NEXT:    movq 40(%rdi), %rcx
+; SSE-NEXT:    movq 32(%rdi), %rdx
+; SSE-NEXT:    movq 16(%rdi), %rax
+; SSE-NEXT:    movq (%rdi), %r9
+; SSE-NEXT:    movq 8(%rdi), %r10
+; SSE-NEXT:    rep bsfq %r9, %r8
+; SSE-NEXT:    rep bsfq %r10, %r11
 ; SSE-NEXT:    addl $64, %r11d
-; SSE-NEXT:    testq %rcx, %rcx
-; SSE-NEXT:    cmovnel %eax, %r11d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %r8d, %r11d
+; SSE-NEXT:    rep bsfq %rax, %rbx
+; SSE-NEXT:    rep bsfq 24(%rdi), %r8
+; SSE-NEXT:    addl $64, %r8d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %ebx, %r8d
+; SSE-NEXT:    subl $-128, %r8d
+; SSE-NEXT:    orq %r10, %r9
+; SSE-NEXT:    cmovnel %r11d, %r8d
 ; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    rep bsfq %r8, %r10
-; SSE-NEXT:    addl $64, %r10d
+; SSE-NEXT:    rep bsfq %rcx, %r9
+; SSE-NEXT:    addl $64, %r9d
 ; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %r10d
-; SSE-NEXT:    movq 32(%rdi), %rbx
-; SSE-NEXT:    subl $-128, %r10d
-; SSE-NEXT:    movq %rcx, %rax
-; SSE-NEXT:    orq %rsi, %rax
-; SSE-NEXT:    cmovnel %r11d, %r10d
-; SSE-NEXT:    rep bsfq %rbx, %rax
-; SSE-NEXT:    rep bsfq %r9, %r11
-; SSE-NEXT:    addl $64, %r11d
-; SSE-NEXT:    testq %rbx, %rbx
-; SSE-NEXT:    cmovnel %eax, %r11d
-; SSE-NEXT:    movq 48(%rdi), %r14
-; SSE-NEXT:    rep bsfq %r14, %r15
+; SSE-NEXT:    cmovnel %eax, %r9d
+; SSE-NEXT:    rep bsfq %rsi, %r10
 ; SSE-NEXT:    rep bsfq 56(%rdi), %rax
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    testq %rsi, %rsi
+; SSE-NEXT:    cmovnel %r10d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %rbx
-; SSE-NEXT:    cmovnel %r11d, %eax
+; SSE-NEXT:    orq %rcx, %rdx
+; SSE-NEXT:    cmovnel %r9d, %eax
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r8, %rsi
-; SSE-NEXT:    orq %rdx, %rcx
-; SSE-NEXT:    orq %rsi, %rcx
-; SSE-NEXT:    cmovnel %r10d, %eax
+; SSE-NEXT:    por 16(%rdi), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %r8d, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_cttz_undef_i512:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 48(%rdi), %r10
-; AVX2-NEXT:    movq 40(%rdi), %r9
-; AVX2-NEXT:    movq 24(%rdi), %r8
-; AVX2-NEXT:    movq 16(%rdi), %rdx
-; AVX2-NEXT:    movq (%rdi), %rcx
-; AVX2-NEXT:    movq 8(%rdi), %rsi
-; AVX2-NEXT:    tzcntq %rcx, %rax
-; AVX2-NEXT:    xorl %ebx, %ebx
-; AVX2-NEXT:    tzcntq %rsi, %rbx
-; AVX2-NEXT:    addl $64, %ebx
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    movq 40(%rdi), %rcx
+; AVX2-NEXT:    movq 32(%rdi), %rdx
+; AVX2-NEXT:    movq 16(%rdi), %rax
+; AVX2-NEXT:    movq (%rdi), %r8
+; AVX2-NEXT:    movq 8(%rdi), %r9
+; AVX2-NEXT:    tzcntq %r8, %rsi
+; AVX2-NEXT:    tzcntq %r9, %r10
+; AVX2-NEXT:    addl $64, %r10d
+; AVX2-NEXT:    testq %r8, %r8
+; AVX2-NEXT:    cmovnel %esi, %r10d
+; AVX2-NEXT:    tzcntq %rax, %r11
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    tzcntq 24(%rdi), %rsi
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %r10d, %esi
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    tzcntq %r8, %r11
-; AVX2-NEXT:    addl $64, %r11d
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
 ; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %r11d
-; AVX2-NEXT:    subl $-128, %r11d
-; AVX2-NEXT:    movq %rcx, %rax
-; AVX2-NEXT:    orq %rsi, %rax
-; AVX2-NEXT:    cmovnel %ebx, %r11d
-; AVX2-NEXT:    movq 32(%rdi), %rbx
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rbx, %rax
-; AVX2-NEXT:    xorl %r14d, %r14d
-; AVX2-NEXT:    tzcntq %r9, %r14
-; AVX2-NEXT:    addl $64, %r14d
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %eax, %r14d
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r10, %r15
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 48(%rdi), %r9
+; AVX2-NEXT:    xorl %r10d, %r10d
+; AVX2-NEXT:    tzcntq %r9, %r10
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq 56(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %r10d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r9, %rbx
-; AVX2-NEXT:    cmovnel %r14d, %eax
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa (%rdi), %xmm0
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r8, %rsi
-; AVX2-NEXT:    orq %rdx, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    vpor 16(%rdi), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
+; AVX2-NEXT:    cmovnel %esi, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_cttz_undef_i512:
@@ -7501,95 +6980,92 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
 ; SSE-NEXT:    pushq %r13
 ; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq %r9, %r14
-; SSE-NEXT:    movq %rcx, %rbx
-; SSE-NEXT:    movq %rdx, %r10
-; SSE-NEXT:    movq %rsi, %r9
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    movq %r9, %r10
+; SSE-NEXT:    movq %r8, %r9
+; SSE-NEXT:    movq %rcx, %r8
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE-NEXT:    rep bsfq %rdi, %rax
-; SSE-NEXT:    rep bsfq %rsi, %r12
-; SSE-NEXT:    addl $64, %r12d
+; SSE-NEXT:    rep bsfq %rdi, %r11
+; SSE-NEXT:    rep bsfq %rsi, %rbx
+; SSE-NEXT:    addl $64, %ebx
 ; SSE-NEXT:    testq %rdi, %rdi
-; SSE-NEXT:    cmovnel %eax, %r12d
-; SSE-NEXT:    rep bsfq %r10, %r15
-; SSE-NEXT:    rep bsfq %rbx, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %r15d, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %rdi, %r13
-; SSE-NEXT:    orq %rsi, %r13
-; SSE-NEXT:    cmovnel %r12d, %eax
-; SSE-NEXT:    movq %r8, %r15
-; SSE-NEXT:    rep bsfq %r8, %r12
-; SSE-NEXT:    movq %r14, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    rep bsfq %r14, %r13
-; SSE-NEXT:    addl $64, %r13d
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %r12d, %r13d
-; SSE-NEXT:    rep bsfq %rcx, %rbp
-; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT:    cmovnel %r11d, %ebx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    rep bsfq %rdx, %r15
+; SSE-NEXT:    rep bsfq %r8, %r14
+; SSE-NEXT:    addl $64, %r14d
+; SSE-NEXT:    testq %rdx, %rdx
+; SSE-NEXT:    cmovnel %r15d, %r14d
+; SSE-NEXT:    subl $-128, %r14d
+; SSE-NEXT:    movq %rdi, %r15
+; SSE-NEXT:    orq %rsi, %r15
+; SSE-NEXT:    cmovnel %ebx, %r14d
+; SSE-NEXT:    rep bsfq %r9, %rbx
+; SSE-NEXT:    rep bsfq %r10, %r12
 ; SSE-NEXT:    addl $64, %r12d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %ebx, %r12d
+; SSE-NEXT:    rep bsfq %rcx, %r15
+; SSE-NEXT:    rep bsfq %r11, %rbx
+; SSE-NEXT:    addl $64, %ebx
 ; SSE-NEXT:    testq %rcx, %rcx
-; SSE-NEXT:    cmovnel %ebp, %r12d
-; SSE-NEXT:    subl $-128, %r12d
-; SSE-NEXT:    movq %r8, %rbp
-; SSE-NEXT:    orq %r14, %rbp
-; SSE-NEXT:    cmovnel %r13d, %r12d
-; SSE-NEXT:    addl $256, %r12d # imm = 0x100
-; SSE-NEXT:    movq %rsi, %r13
-; SSE-NEXT:    orq %rbx, %r13
-; SSE-NEXT:    movq %rdi, %rbp
-; SSE-NEXT:    orq %r10, %rbp
-; SSE-NEXT:    orq %r13, %rbp
+; SSE-NEXT:    cmovnel %r15d, %ebx
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE-NEXT:    subl $-128, %ebx
+; SSE-NEXT:    movq %r9, %r13
+; SSE-NEXT:    orq %r10, %r13
+; SSE-NEXT:    cmovnel %r12d, %ebx
+; SSE-NEXT:    addl $256, %ebx # imm = 0x100
+; SSE-NEXT:    movq %rsi, %r12
+; SSE-NEXT:    orq %r8, %r12
+; SSE-NEXT:    movq %rdi, %r13
+; SSE-NEXT:    orq %rdx, %r13
+; SSE-NEXT:    orq %r12, %r13
+; SSE-NEXT:    cmovnel %r14d, %ebx
+; SSE-NEXT:    rep bsfq %rax, %r14
+; SSE-NEXT:    rep bsfq %r15, %r13
+; SSE-NEXT:    addl $64, %r13d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %r14d, %r13d
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT:    rep bsfq %r12, %rbp
+; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %r14
+; SSE-NEXT:    addl $64, %r14d
+; SSE-NEXT:    testq %r12, %r12
+; SSE-NEXT:    cmovnel %ebp, %r14d
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE-NEXT:    subl $-128, %r14d
+; SSE-NEXT:    orq %r15, %rax
+; SSE-NEXT:    cmovnel %r13d, %r14d
+; SSE-NEXT:    rep bsfq %r12, %rax
+; SSE-NEXT:    rep bsfq %rbp, %r15
+; SSE-NEXT:    addl $64, %r15d
+; SSE-NEXT:    testq %r12, %r12
+; SSE-NEXT:    cmovnel %eax, %r15d
 ; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    cmovnel %eax, %r12d
-; SSE-NEXT:    rep bsfq %r11, %rbp
-; SSE-NEXT:    rep bsfq %r13, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r11, %r11
-; SSE-NEXT:    cmovnel %ebp, %eax
-; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %rbp
-; SSE-NEXT:    addl $64, %ebp
-; SSE-NEXT:    rep bsfq %rdx, %rcx
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %ecx, %ebp
-; SSE-NEXT:    subl $-128, %ebp
-; SSE-NEXT:    movq %r11, %rcx
-; SSE-NEXT:    orq %r13, %rcx
-; SSE-NEXT:    cmovnel %eax, %ebp
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE-NEXT:    rep bsfq %r14, %rcx
-; SSE-NEXT:    addl $64, %ecx
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE-NEXT:    rep bsfq %rdx, %rax
-; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ecx
 ; SSE-NEXT:    rep bsfq {{[0-9]+}}(%rsp), %rax
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE-NEXT:    rep bsfq %r8, %rsi
-; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %esi, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r14, %rdx
+; SSE-NEXT:    rep bsfq %r13, %rcx
+; SSE-NEXT:    testq %r13, %r13
 ; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r11
+; SSE-NEXT:    subl $-128, %eax
+; SSE-NEXT:    orq %rbp, %r12
+; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    movdqa {{[0-9]+}}(%rsp), %xmm0
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %r13, %r11
-; SSE-NEXT:    cmovnel %ebp, %eax
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %rbx
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; SSE-NEXT:    orq %rbx, %r9
-; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; SSE-NEXT:    orq %r15, %rdi
-; SSE-NEXT:    orq %r10, %rdi
-; SSE-NEXT:    addl $512, %eax # imm = 0x200
+; SSE-NEXT:    por {{[0-9]+}}(%rsp), %xmm0
+; SSE-NEXT:    ptest %xmm0, %xmm0
+; SSE-NEXT:    cmovnel %r14d, %eax
+; SSE-NEXT:    orq %r11, %r8
+; SSE-NEXT:    orq %r10, %rsi
+; SSE-NEXT:    orq %r8, %rsi
+; SSE-NEXT:    orq {{[0-9]+}}(%rsp), %rdx
 ; SSE-NEXT:    orq %r9, %rdi
-; SSE-NEXT:    cmovnel %r12d, %eax
+; SSE-NEXT:    orq %rdx, %rdi
+; SSE-NEXT:    addl $512, %eax # imm = 0x200
+; SSE-NEXT:    orq %rsi, %rdi
+; SSE-NEXT:    cmovnel %ebx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
 ; SSE-NEXT:    popq %r12
@@ -7607,111 +7083,108 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq %r9, %rbx
-; AVX2-NEXT:    movq %r8, %r14
-; AVX2-NEXT:    movq %rcx, %r11
-; AVX2-NEXT:    movq %rdx, %r10
-; AVX2-NEXT:    movq %rsi, %r9
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT:    movq %r9, %r10
+; AVX2-NEXT:    movq %r8, %r9
+; AVX2-NEXT:    movq %rcx, %r8
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    tzcntq %rdi, %rax
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r9, %r15
-; AVX2-NEXT:    addl $64, %r15d
-; AVX2-NEXT:    testq %rdi, %rdi
-; AVX2-NEXT:    cmovnel %eax, %r15d
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %r10, %r12
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r11, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %r12d, %eax
-; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %rdi, %r12
-; AVX2-NEXT:    orq %r9, %r12
-; AVX2-NEXT:    cmovnel %r15d, %eax
-; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %r14, %r15
-; AVX2-NEXT:    movq %rbx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rdi, %rbx
 ; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rbx, %r12
+; AVX2-NEXT:    tzcntq %rsi, %r12
 ; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %r14, %r14
-; AVX2-NEXT:    cmovnel %r15d, %r12d
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %rcx, %r13
+; AVX2-NEXT:    testq %rdi, %rdi
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rdx, %rbx
 ; AVX2-NEXT:    xorl %r15d, %r15d
-; AVX2-NEXT:    tzcntq %rdx, %r15
+; AVX2-NEXT:    tzcntq %r8, %r15
 ; AVX2-NEXT:    addl $64, %r15d
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %r13d, %r15d
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    cmovnel %ebx, %r15d
 ; AVX2-NEXT:    subl $-128, %r15d
-; AVX2-NEXT:    movq %r14, %r13
-; AVX2-NEXT:    orq %rbx, %r13
+; AVX2-NEXT:    movq %rdi, %rbx
+; AVX2-NEXT:    orq %rsi, %rbx
 ; AVX2-NEXT:    cmovnel %r12d, %r15d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    addl $256, %r15d # imm = 0x100
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    tzcntq %r10, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %ebx, %r12d
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    tzcntq %r11, %r13
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %rcx, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %r13d, %ebx
+; AVX2-NEXT:    subl $-128, %ebx
 ; AVX2-NEXT:    movq %r9, %r13
-; AVX2-NEXT:    orq %r11, %r13
-; AVX2-NEXT:    movq %rdi, %rbp
-; AVX2-NEXT:    orq %r10, %rbp
-; AVX2-NEXT:    orq %r13, %rbp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    cmovnel %eax, %r15d
-; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    tzcntq %r12, %rbp
-; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r13, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r12, %r12
-; AVX2-NEXT:    cmovnel %ebp, %eax
+; AVX2-NEXT:    orq %r10, %r13
+; AVX2-NEXT:    cmovnel %r12d, %ebx
+; AVX2-NEXT:    addl $256, %ebx # imm = 0x100
+; AVX2-NEXT:    movq %rsi, %r12
+; AVX2-NEXT:    orq %r8, %r12
+; AVX2-NEXT:    movq %rdi, %r13
+; AVX2-NEXT:    orq %rdx, %r13
+; AVX2-NEXT:    orq %r12, %r13
+; AVX2-NEXT:    cmovnel %r15d, %ebx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq %rax, %r15
+; AVX2-NEXT:    xorl %r13d, %r13d
+; AVX2-NEXT:    tzcntq %r12, %r13
+; AVX2-NEXT:    addl $64, %r13d
+; AVX2-NEXT:    testq %rax, %rax
+; AVX2-NEXT:    cmovnel %r15d, %r13d
 ; AVX2-NEXT:    xorl %ebp, %ebp
-; AVX2-NEXT:    tzcntq %r8, %rbp
-; AVX2-NEXT:    addl $64, %ebp
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %rsi, %rcx
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %ecx, %ebp
-; AVX2-NEXT:    subl $-128, %ebp
-; AVX2-NEXT:    movq %r12, %rcx
-; AVX2-NEXT:    orq %r13, %rcx
-; AVX2-NEXT:    cmovnel %eax, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %rbx, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    tzcntq %r14, %rbp
+; AVX2-NEXT:    xorl %r15d, %r15d
+; AVX2-NEXT:    tzcntq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    addl $64, %r15d
+; AVX2-NEXT:    testq %r14, %r14
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    cmovnel %ebp, %r15d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT:    subl $-128, %r15d
+; AVX2-NEXT:    orq %r12, %rax
+; AVX2-NEXT:    cmovnel %r13d, %r15d
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
+; AVX2-NEXT:    tzcntq %rbp, %rax
+; AVX2-NEXT:    xorl %r12d, %r12d
+; AVX2-NEXT:    tzcntq %r14, %r12
+; AVX2-NEXT:    addl $64, %r12d
+; AVX2-NEXT:    testq %rbp, %rbp
+; AVX2-NEXT:    cmovnel %eax, %r12d
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    tzcntq %r8, %rsi
-; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %esi, %eax
-; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %rbx, %rdx
+; AVX2-NEXT:    xorl %ecx, %ecx
+; AVX2-NEXT:    tzcntq %r13, %rcx
+; AVX2-NEXT:    testq %r13, %r13
 ; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    subl $-128, %eax
+; AVX2-NEXT:    orq %r14, %rbp
+; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vmovdqa {{[0-9]+}}(%rsp), %xmm0
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r13, %r12
-; AVX2-NEXT:    cmovnel %ebp, %eax
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r9 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %r11, %r9
-; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    orq %r14, %rdi
-; AVX2-NEXT:    orq %r10, %rdi
-; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r9, %rdi
+; AVX2-NEXT:    vpor {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT:    vptest %xmm0, %xmm0
 ; AVX2-NEXT:    cmovnel %r15d, %eax
+; AVX2-NEXT:    orq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    orq %r10, %rsi
+; AVX2-NEXT:    orq %r8, %rsi
+; AVX2-NEXT:    orq %r11, %rdx
+; AVX2-NEXT:    orq %r9, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    addl $512, %eax # imm = 0x200
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    cmovnel %ebx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
@@ -7898,384 +7371,296 @@ define i32 @test_cttz_undef_i1024(i1024 %a0) nounwind {
 define i32 @load_cttz_undef_i1024(ptr %p0) nounwind {
 ; SSE-LABEL: load_cttz_undef_i1024:
 ; SSE:       # %bb.0:
-; SSE-NEXT:    pushq %rbp
-; SSE-NEXT:    pushq %r15
-; SSE-NEXT:    pushq %r14
-; SSE-NEXT:    pushq %r13
-; SSE-NEXT:    pushq %r12
 ; SSE-NEXT:    pushq %rbx
-; SSE-NEXT:    movq 72(%rdi), %rbx
-; SSE-NEXT:    movq 56(%rdi), %r9
-; SSE-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq 48(%rdi), %rcx
-; SSE-NEXT:    movq 40(%rdi), %r10
-; SSE-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    movq 32(%rdi), %rsi
-; SSE-NEXT:    movq 24(%rdi), %rbp
+; SSE-NEXT:    movq 104(%rdi), %rcx
+; SSE-NEXT:    movq 40(%rdi), %rax
+; SSE-NEXT:    movq 16(%rdi), %rdx
 ; SSE-NEXT:    movq (%rdi), %r8
-; SSE-NEXT:    movq 8(%rdi), %r11
-; SSE-NEXT:    rep bsfq %r8, %rax
-; SSE-NEXT:    rep bsfq %r11, %rdx
-; SSE-NEXT:    addl $64, %edx
+; SSE-NEXT:    movq 8(%rdi), %r9
+; SSE-NEXT:    rep bsfq %r8, %rsi
+; SSE-NEXT:    rep bsfq %r9, %r11
+; SSE-NEXT:    addl $64, %r11d
 ; SSE-NEXT:    testq %r8, %r8
-; SSE-NEXT:    cmovnel %eax, %edx
-; SSE-NEXT:    movq 16(%rdi), %r14
-; SSE-NEXT:    rep bsfq %r14, %r15
-; SSE-NEXT:    rep bsfq %rbp, %rax
-; SSE-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r14, %r14
-; SSE-NEXT:    cmovnel %r15d, %eax
-; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    movq %r8, %r15
-; SSE-NEXT:    orq %r11, %r15
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    rep bsfq %rsi, %rdx
-; SSE-NEXT:    rep bsfq %r10, %r13
-; SSE-NEXT:    addl $64, %r13d
-; SSE-NEXT:    testq %rsi, %rsi
-; SSE-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    cmovnel %edx, %r13d
-; SSE-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE-NEXT:    rep bsfq %rcx, %rdx
-; SSE-NEXT:    rep bsfq %r9, %r15
-; SSE-NEXT:    addl $64, %r15d
-; SSE-NEXT:    testq %rcx, %rcx
-; SSE-NEXT:    cmovnel %edx, %r15d
-; SSE-NEXT:    movq 64(%rdi), %r12
-; SSE-NEXT:    subl $-128, %r15d
-; SSE-NEXT:    movq %rsi, %rdx
-; SSE-NEXT:    orq %r10, %rdx
-; SSE-NEXT:    cmovnel %r13d, %r15d
-; SSE-NEXT:    addl $256, %r15d # imm = 0x100
-; SSE-NEXT:    movq %r11, %rdx
-; SSE-NEXT:    orq %rbp, %rdx
-; SSE-NEXT:    movq %r8, %r13
-; SSE-NEXT:    orq %r14, %r13
-; SSE-NEXT:    orq %rdx, %r13
-; SSE-NEXT:    cmovnel %eax, %r15d
-; SSE-NEXT:    rep bsfq %r12, %rdx
-; SSE-NEXT:    rep bsfq %rbx, %rax
-; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    testq %r12, %r12
-; SSE-NEXT:    cmovnel %edx, %eax
-; SSE-NEXT:    movq 88(%rdi), %rbp
-; SSE-NEXT:    rep bsfq %rbp, %r13
-; SSE-NEXT:    addl $64, %r13d
-; SSE-NEXT:    movq 80(%rdi), %r10
-; SSE-NEXT:    rep bsfq %r10, %rcx
-; SSE-NEXT:    testq %r10, %r10
-; SSE-NEXT:    cmovnel %ecx, %r13d
-; SSE-NEXT:    subl $-128, %r13d
-; SSE-NEXT:    movq %r12, %rcx
-; SSE-NEXT:    orq %rbx, %rcx
-; SSE-NEXT:    cmovnel %eax, %r13d
-; SSE-NEXT:    movq 104(%rdi), %r9
-; SSE-NEXT:    rep bsfq %r9, %rcx
-; SSE-NEXT:    addl $64, %ecx
-; SSE-NEXT:    movq 96(%rdi), %rdx
-; SSE-NEXT:    rep bsfq %rdx, %rax
+; SSE-NEXT:    cmovnel %esi, %r11d
+; SSE-NEXT:    rep bsfq 24(%rdi), %rsi
+; SSE-NEXT:    rep bsfq %rdx, %rbx
+; SSE-NEXT:    addl $64, %esi
 ; SSE-NEXT:    testq %rdx, %rdx
-; SSE-NEXT:    cmovnel %eax, %ecx
+; SSE-NEXT:    movq 32(%rdi), %r10
+; SSE-NEXT:    cmovnel %ebx, %esi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r9, %r8
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    rep bsfq %r10, %rdx
+; SSE-NEXT:    rep bsfq %rax, %r11
+; SSE-NEXT:    addl $64, %r11d
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %edx, %r11d
+; SSE-NEXT:    movq 48(%rdi), %r8
+; SSE-NEXT:    rep bsfq %r8, %r9
+; SSE-NEXT:    rep bsfq 56(%rdi), %rdx
+; SSE-NEXT:    addl $64, %edx
+; SSE-NEXT:    testq %r8, %r8
+; SSE-NEXT:    cmovnel %r9d, %edx
+; SSE-NEXT:    movq 80(%rdi), %r9
+; SSE-NEXT:    movq 72(%rdi), %r8
+; SSE-NEXT:    subl $-128, %edx
+; SSE-NEXT:    orq %rax, %r10
+; SSE-NEXT:    movq 64(%rdi), %rax
+; SSE-NEXT:    movdqa 16(%rdi), %xmm0
+; SSE-NEXT:    cmovnel %r11d, %edx
+; SSE-NEXT:    movdqa (%rdi), %xmm1
+; SSE-NEXT:    addl $256, %edx # imm = 0x100
+; SSE-NEXT:    movdqa %xmm1, %xmm2
+; SSE-NEXT:    por %xmm0, %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %edx
+; SSE-NEXT:    rep bsfq %rax, %rsi
+; SSE-NEXT:    rep bsfq %r8, %r10
+; SSE-NEXT:    addl $64, %r10d
+; SSE-NEXT:    testq %rax, %rax
+; SSE-NEXT:    cmovnel %esi, %r10d
+; SSE-NEXT:    rep bsfq 88(%rdi), %rsi
+; SSE-NEXT:    rep bsfq %r9, %r11
+; SSE-NEXT:    addl $64, %esi
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    movq 96(%rdi), %r9
+; SSE-NEXT:    cmovnel %r11d, %esi
+; SSE-NEXT:    subl $-128, %esi
+; SSE-NEXT:    orq %r8, %rax
+; SSE-NEXT:    cmovnel %r10d, %esi
+; SSE-NEXT:    rep bsfq %r9, %rax
+; SSE-NEXT:    rep bsfq %rcx, %r8
+; SSE-NEXT:    addl $64, %r8d
+; SSE-NEXT:    testq %r9, %r9
+; SSE-NEXT:    cmovnel %eax, %r8d
+; SSE-NEXT:    movq 112(%rdi), %r10
+; SSE-NEXT:    rep bsfq %r10, %r11
 ; SSE-NEXT:    rep bsfq 120(%rdi), %rax
-; SSE-NEXT:    movq 112(%rdi), %rdi
 ; SSE-NEXT:    addl $64, %eax
-; SSE-NEXT:    rep bsfq %rdi, %rsi
-; SSE-NEXT:    testq %rdi, %rdi
-; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    testq %r10, %r10
+; SSE-NEXT:    cmovnel %r11d, %eax
 ; SSE-NEXT:    subl $-128, %eax
-; SSE-NEXT:    orq %r9, %rdx
-; SSE-NEXT:    cmovnel %ecx, %eax
-; SSE-NEXT:    orq %rbp, %rbx
-; SSE-NEXT:    orq %r10, %r12
+; SSE-NEXT:    orq %rcx, %r9
+; SSE-NEXT:    cmovnel %r8d, %eax
+; SSE-NEXT:    movdqa 64(%rdi), %xmm2
 ; SSE-NEXT:    addl $256, %eax # imm = 0x100
-; SSE-NEXT:    orq %rbx, %r12
-; SSE-NEXT:    cmovnel %r13d, %eax
-; SSE-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; SSE-NEXT:    orq %rcx, %r11
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Folded Reload
-; SSE-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; SSE-NEXT:    orq %r14, %r8
+; SSE-NEXT:    por 80(%rdi), %xmm2
+; SSE-NEXT:    ptest %xmm2, %xmm2
+; SSE-NEXT:    cmovnel %esi, %eax
+; SSE-NEXT:    por 48(%rdi), %xmm0
+; SSE-NEXT:    por 32(%rdi), %xmm1
 ; SSE-NEXT:    addl $512, %eax # imm = 0x200
-; SSE-NEXT:    orq %r11, %r8
-; SSE-NEXT:    cmovnel %r15d, %eax
+; SSE-NEXT:    por %xmm0, %xmm1
+; SSE-NEXT:    ptest %xmm1, %xmm1
+; SSE-NEXT:    cmovnel %edx, %eax
 ; SSE-NEXT:    # kill: def $eax killed $eax killed $rax
 ; SSE-NEXT:    popq %rbx
-; SSE-NEXT:    popq %r12
-; SSE-NEXT:    popq %r13
-; SSE-NEXT:    popq %r14
-; SSE-NEXT:    popq %r15
-; SSE-NEXT:    popq %rbp
 ; SSE-NEXT:    retq
 ;
 ; AVX2-LABEL: load_cttz_undef_i1024:
 ; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    movq 72(%rdi), %r14
-; AVX2-NEXT:    movq 64(%rdi), %r15
-; AVX2-NEXT:    movq 56(%rdi), %r9
-; AVX2-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq 48(%rdi), %rcx
-; AVX2-NEXT:    movq 40(%rdi), %r10
-; AVX2-NEXT:    movq %r10, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq 32(%rdi), %rsi
-; AVX2-NEXT:    movq 24(%rdi), %rbp
-; AVX2-NEXT:    movq 16(%rdi), %rbx
+; AVX2-NEXT:    movq 104(%rdi), %rcx
+; AVX2-NEXT:    movq 48(%rdi), %rsi
+; AVX2-NEXT:    movq 16(%rdi), %rdx
 ; AVX2-NEXT:    movq (%rdi), %r8
-; AVX2-NEXT:    movq 8(%rdi), %r11
+; AVX2-NEXT:    movq 8(%rdi), %r9
 ; AVX2-NEXT:    tzcntq %r8, %rax
-; AVX2-NEXT:    tzcntq %r11, %rdx
-; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r9, %rbx
+; AVX2-NEXT:    addl $64, %ebx
 ; AVX2-NEXT:    testq %r8, %r8
-; AVX2-NEXT:    cmovnel %eax, %edx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %rbx, %r12
+; AVX2-NEXT:    cmovnel %eax, %ebx
+; AVX2-NEXT:    tzcntq %rdx, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rbp, %rax
-; AVX2-NEXT:    movq %rbp, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    tzcntq 24(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %rbx, %rbx
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    testq %rdx, %rdx
+; AVX2-NEXT:    movq 40(%rdi), %r10
+; AVX2-NEXT:    cmovnel %r11d, %eax
+; AVX2-NEXT:    movq 32(%rdi), %r11
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    movq %r8, %r12
-; AVX2-NEXT:    orq %r11, %r12
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %rsi, %rdx
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %r10, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    cmovnel %edx, %r13d
-; AVX2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    orq %r9, %r8
+; AVX2-NEXT:    cmovnel %ebx, %eax
 ; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %rcx, %rdx
-; AVX2-NEXT:    xorl %r12d, %r12d
-; AVX2-NEXT:    tzcntq %r9, %r12
-; AVX2-NEXT:    addl $64, %r12d
-; AVX2-NEXT:    testq %rcx, %rcx
-; AVX2-NEXT:    cmovnel %edx, %r12d
-; AVX2-NEXT:    subl $-128, %r12d
-; AVX2-NEXT:    movq %rsi, %rdx
-; AVX2-NEXT:    orq %r10, %rdx
-; AVX2-NEXT:    cmovnel %r13d, %r12d
-; AVX2-NEXT:    addl $256, %r12d # imm = 0x100
-; AVX2-NEXT:    movq %r11, %rdx
-; AVX2-NEXT:    orq %rbp, %rdx
-; AVX2-NEXT:    movq %r8, %r13
-; AVX2-NEXT:    orq %rbx, %r13
-; AVX2-NEXT:    orq %rdx, %r13
-; AVX2-NEXT:    cmovnel %eax, %r12d
+; AVX2-NEXT:    tzcntq %r11, %rdx
+; AVX2-NEXT:    xorl %ebx, %ebx
+; AVX2-NEXT:    tzcntq %r10, %rbx
+; AVX2-NEXT:    addl $64, %ebx
+; AVX2-NEXT:    testq %r11, %r11
+; AVX2-NEXT:    cmovnel %edx, %ebx
 ; AVX2-NEXT:    xorl %edx, %edx
-; AVX2-NEXT:    tzcntq %r15, %rdx
+; AVX2-NEXT:    tzcntq 56(%rdi), %rdx
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rsi, %r8
+; AVX2-NEXT:    addl $64, %edx
+; AVX2-NEXT:    testq %rsi, %rsi
+; AVX2-NEXT:    movq 80(%rdi), %r9
+; AVX2-NEXT:    cmovnel %r8d, %edx
+; AVX2-NEXT:    movq 72(%rdi), %r8
+; AVX2-NEXT:    subl $-128, %edx
+; AVX2-NEXT:    orq %r10, %r11
+; AVX2-NEXT:    movq 64(%rdi), %r10
+; AVX2-NEXT:    cmovnel %ebx, %edx
+; AVX2-NEXT:    vmovdqu (%rdi), %ymm0
+; AVX2-NEXT:    addl $256, %edx # imm = 0x100
+; AVX2-NEXT:    vpor 16(%rdi), %xmm0, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %eax, %edx
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %r14, %rax
-; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    testq %r15, %r15
-; AVX2-NEXT:    cmovnel %edx, %eax
-; AVX2-NEXT:    movq 88(%rdi), %rbp
-; AVX2-NEXT:    xorl %r13d, %r13d
-; AVX2-NEXT:    tzcntq %rbp, %r13
-; AVX2-NEXT:    addl $64, %r13d
-; AVX2-NEXT:    movq 80(%rdi), %r10
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %r10, %rcx
+; AVX2-NEXT:    tzcntq %r10, %rax
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    tzcntq %r8, %r11
+; AVX2-NEXT:    addl $64, %r11d
 ; AVX2-NEXT:    testq %r10, %r10
-; AVX2-NEXT:    cmovnel %ecx, %r13d
-; AVX2-NEXT:    subl $-128, %r13d
-; AVX2-NEXT:    movq %r15, %rcx
-; AVX2-NEXT:    orq %r14, %rcx
-; AVX2-NEXT:    cmovnel %eax, %r13d
-; AVX2-NEXT:    movq 104(%rdi), %r9
-; AVX2-NEXT:    xorl %ecx, %ecx
-; AVX2-NEXT:    tzcntq %r9, %rcx
-; AVX2-NEXT:    addl $64, %ecx
-; AVX2-NEXT:    movq 96(%rdi), %rdx
+; AVX2-NEXT:    cmovnel %eax, %r11d
+; AVX2-NEXT:    xorl %esi, %esi
+; AVX2-NEXT:    tzcntq 88(%rdi), %rsi
 ; AVX2-NEXT:    xorl %eax, %eax
-; AVX2-NEXT:    tzcntq %rdx, %rax
-; AVX2-NEXT:    testq %rdx, %rdx
-; AVX2-NEXT:    cmovnel %eax, %ecx
-; AVX2-NEXT:    movq 112(%rdi), %rsi
+; AVX2-NEXT:    tzcntq %r9, %rax
+; AVX2-NEXT:    addl $64, %esi
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    movq 96(%rdi), %r9
+; AVX2-NEXT:    cmovnel %eax, %esi
+; AVX2-NEXT:    subl $-128, %esi
+; AVX2-NEXT:    orq %r8, %r10
+; AVX2-NEXT:    cmovnel %r11d, %esi
+; AVX2-NEXT:    xorl %eax, %eax
+; AVX2-NEXT:    tzcntq %r9, %rax
+; AVX2-NEXT:    xorl %r8d, %r8d
+; AVX2-NEXT:    tzcntq %rcx, %r8
+; AVX2-NEXT:    addl $64, %r8d
+; AVX2-NEXT:    testq %r9, %r9
+; AVX2-NEXT:    cmovnel %eax, %r8d
+; AVX2-NEXT:    movq 112(%rdi), %r10
+; AVX2-NEXT:    xorl %r11d, %r11d
+; AVX2-NEXT:    tzcntq %r10, %r11
 ; AVX2-NEXT:    xorl %eax, %eax
 ; AVX2-NEXT:    tzcntq 120(%rdi), %rax
 ; AVX2-NEXT:    addl $64, %eax
-; AVX2-NEXT:    tzcntq %rsi, %rdi
-; AVX2-NEXT:    testq %rsi, %rsi
-; AVX2-NEXT:    cmovnel %edi, %eax
+; AVX2-NEXT:    testq %r10, %r10
+; AVX2-NEXT:    cmovnel %r11d, %eax
 ; AVX2-NEXT:    subl $-128, %eax
-; AVX2-NEXT:    orq %r9, %rdx
-; AVX2-NEXT:    cmovnel %ecx, %eax
-; AVX2-NEXT:    orq %rbp, %r14
-; AVX2-NEXT:    orq %r10, %r15
+; AVX2-NEXT:    orq %rcx, %r9
+; AVX2-NEXT:    cmovnel %r8d, %eax
+; AVX2-NEXT:    vmovdqa 64(%rdi), %xmm1
 ; AVX2-NEXT:    addl $256, %eax # imm = 0x100
-; AVX2-NEXT:    orq %r14, %r15
-; AVX2-NEXT:    cmovnel %r13d, %eax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rcx, %r11
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Folded Reload
-; AVX2-NEXT:    orq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Folded Reload
-; AVX2-NEXT:    orq %rbx, %r8
+; AVX2-NEXT:    vpor 80(%rdi), %xmm1, %xmm1
+; AVX2-NEXT:    vptest %xmm1, %xmm1
+; AVX2-NEXT:    cmovnel %esi, %eax
+; AVX2-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
 ; AVX2-NEXT:    addl $512, %eax # imm = 0x200
-; AVX2-NEXT:    orq %r11, %r8
-; AVX2-NEXT:    cmovnel %r12d, %eax
+; AVX2-NEXT:    vptest %ymm0, %ymm0
+; AVX2-NEXT:    cmovnel %edx, %eax
 ; AVX2-NEXT:    # kill: def $eax killed $eax killed $rax
 ; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    vzeroupper
 ; AVX2-NEXT:    retq
 ;
 ; AVX512F-LABEL: load_cttz_undef_i1024:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512F-NEXT:    movq 16(%rdi), %rax
-; AVX512F-NEXT:    movq (%rdi), %rcx
-; AVX512F-NEXT:    movq 8(%rdi), %rdx
-; AVX512F-NEXT:    movq 24(%rdi), %rsi
-; AVX512F-NEXT:    orq 56(%rdi), %rsi
-; AVX512F-NEXT:    orq 40(%rdi), %rdx
-; AVX512F-NEXT:    orq 48(%rdi), %rax
-; AVX512F-NEXT:    orq %rsi, %rdx
-; AVX512F-NEXT:    orq 32(%rdi), %rcx
-; AVX512F-NEXT:    orq %rax, %rcx
+; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512F-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512F-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512F-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512F-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512F-NEXT:    vplzcntq %zmm3, %zmm3
 ; AVX512F-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
 ; AVX512F-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512F-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm1, %esi
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512F-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512F-NEXT:    vplzcntq %zmm1, %zmm1
-; AVX512F-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
 ; AVX512F-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512F-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512F-NEXT:    vmovd %xmm0, %eax
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm3, %ecx
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512F-NEXT:    vplzcntq %zmm2, %zmm2
+; AVX512F-NEXT:    vpsubq %zmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512F-NEXT:    vmovd %xmm1, %eax
 ; AVX512F-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512F-NEXT:    orq %rdx, %rcx
-; AVX512F-NEXT:    cmovnel %esi, %eax
+; AVX512F-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512F-NEXT:    vptest %ymm0, %ymm0
+; AVX512F-NEXT:    cmovnel %ecx, %eax
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512POPCNT-LABEL: load_cttz_undef_i1024:
 ; AVX512POPCNT:       # %bb.0:
-; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512POPCNT-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512POPCNT-NEXT:    movq 16(%rdi), %rax
-; AVX512POPCNT-NEXT:    movq (%rdi), %rcx
-; AVX512POPCNT-NEXT:    movq 8(%rdi), %rdx
-; AVX512POPCNT-NEXT:    movq 24(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 56(%rdi), %rsi
-; AVX512POPCNT-NEXT:    orq 40(%rdi), %rdx
-; AVX512POPCNT-NEXT:    orq 48(%rdi), %rax
-; AVX512POPCNT-NEXT:    orq %rsi, %rdx
-; AVX512POPCNT-NEXT:    orq 32(%rdi), %rcx
-; AVX512POPCNT-NEXT:    orq %rax, %rcx
+; AVX512POPCNT-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512POPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512POPCNT-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512POPCNT-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512POPCNT-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512POPCNT-NEXT:    vpopcntq %zmm3, %zmm3
 ; AVX512POPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
 ; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
-; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm1, %esi
-; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512POPCNT-NEXT:    vpopcntq %zmm1, %zmm1
-; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm1, %zmm1
 ; AVX512POPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512POPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512POPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512POPCNT-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm3, %ecx
+; AVX512POPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512POPCNT-NEXT:    vpopcntq %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
+; AVX512POPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512POPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512POPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512POPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512POPCNT-NEXT:    orq %rdx, %rcx
-; AVX512POPCNT-NEXT:    cmovnel %esi, %eax
+; AVX512POPCNT-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512POPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512POPCNT-NEXT:    retq
 ;
 ; AVX512VL-LABEL: load_cttz_undef_i1024:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512VL-NEXT:    movq 16(%rdi), %rax
-; AVX512VL-NEXT:    movq (%rdi), %rcx
-; AVX512VL-NEXT:    movq 8(%rdi), %rdx
-; AVX512VL-NEXT:    movq 24(%rdi), %rsi
-; AVX512VL-NEXT:    orq 56(%rdi), %rsi
-; AVX512VL-NEXT:    orq 40(%rdi), %rdx
-; AVX512VL-NEXT:    orq 48(%rdi), %rax
-; AVX512VL-NEXT:    orq 32(%rdi), %rcx
-; AVX512VL-NEXT:    orq %rsi, %rdx
-; AVX512VL-NEXT:    orq %rax, %rcx
+; AVX512VL-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VL-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512VL-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VL-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VL-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512VL-NEXT:    vplzcntq %zmm3, %zmm3
 ; AVX512VL-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [64,128,192,256,320,384,448,512]
 ; AVX512VL-NEXT:    vpsubq %zmm3, %zmm4, %zmm3
-; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm1, %esi
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VL-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VL-NEXT:    vplzcntq %zmm1, %zmm1
-; AVX512VL-NEXT:    vpsubq %zmm1, %zmm4, %zmm1
 ; AVX512VL-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VL-NEXT:    vmovd %xmm0, %eax
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm3, %ecx
+; AVX512VL-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VL-NEXT:    vplzcntq %zmm2, %zmm2
+; AVX512VL-NEXT:    vpsubq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vmovd %xmm1, %eax
 ; AVX512VL-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    orq %rdx, %rcx
-; AVX512VL-NEXT:    cmovnel %esi, %eax
+; AVX512VL-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VL-NEXT:    vptest %ymm0, %ymm0
+; AVX512VL-NEXT:    cmovnel %ecx, %eax
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VLPOPCNT-LABEL: load_cttz_undef_i1024:
 ; AVX512VLPOPCNT:       # %bb.0:
-; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm0
-; AVX512VLPOPCNT-NEXT:    vmovdqu64 (%rdi), %zmm1
-; AVX512VLPOPCNT-NEXT:    movq 16(%rdi), %rax
-; AVX512VLPOPCNT-NEXT:    movq (%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    movq 8(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    movq 24(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq 56(%rdi), %rsi
-; AVX512VLPOPCNT-NEXT:    orq 40(%rdi), %rdx
-; AVX512VLPOPCNT-NEXT:    orq 48(%rdi), %rax
-; AVX512VLPOPCNT-NEXT:    orq 32(%rdi), %rcx
-; AVX512VLPOPCNT-NEXT:    orq %rsi, %rdx
-; AVX512VLPOPCNT-NEXT:    orq %rax, %rcx
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 (%rdi), %zmm0
+; AVX512VLPOPCNT-NEXT:    vmovdqu64 64(%rdi), %zmm1
 ; AVX512VLPOPCNT-NEXT:    vpternlogd {{.*#+}} zmm2 = -1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm3
-; AVX512VLPOPCNT-NEXT:    vpandnq %zmm3, %zmm1, %zmm3
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm3
+; AVX512VLPOPCNT-NEXT:    vpandnq %zmm3, %zmm0, %zmm3
 ; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm3, %zmm3
 ; AVX512VLPOPCNT-NEXT:    vmovdqa64 {{.*#+}} zmm4 = [0,64,128,192,256,320,384,448]
 ; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm3, %zmm3
-; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm1 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %esi
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpandnq %zmm1, %zmm0, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm1, %zmm1
-; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm1, %zmm1
 ; AVX512VLPOPCNT-NEXT:    vptestmq %zmm0, %zmm0, %k1
-; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm1, %zmm0 {%k1} {z}
-; AVX512VLPOPCNT-NEXT:    vmovd %xmm0, %eax
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm3, %ecx
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpandnq %zmm2, %zmm1, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpopcntq %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vpaddq %zmm4, %zmm2, %zmm2
+; AVX512VLPOPCNT-NEXT:    vptestmq %zmm1, %zmm1, %k1
+; AVX512VLPOPCNT-NEXT:    vpcompressq %zmm2, %zmm1 {%k1} {z}
+; AVX512VLPOPCNT-NEXT:    vmovd %xmm1, %eax
 ; AVX512VLPOPCNT-NEXT:    addl $512, %eax # imm = 0x200
-; AVX512VLPOPCNT-NEXT:    orq %rdx, %rcx
-; AVX512VLPOPCNT-NEXT:    cmovnel %esi, %eax
+; AVX512VLPOPCNT-NEXT:    vpor 32(%rdi), %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    vptest %ymm0, %ymm0
+; AVX512VLPOPCNT-NEXT:    cmovnel %ecx, %eax
 ; AVX512VLPOPCNT-NEXT:    vzeroupper
 ; AVX512VLPOPCNT-NEXT:    retq
   %a0 = load i1024, ptr %p0
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i256.ll b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
index 549b6e3fc0dd9..cc0c1ef23c3a5 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i256.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i256.ll
@@ -760,107 +760,71 @@ define i256 @fshl_rot_i256_load(ptr %p0, i256 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshl_rot_i256_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq %rdx, %rcx
-; AVX512F-NEXT:    movq 16(%rsi), %rdx
-; AVX512F-NEXT:    movq (%rsi), %r9
-; AVX512F-NEXT:    movq 8(%rsi), %r8
-; AVX512F-NEXT:    movq 24(%rsi), %rax
-; AVX512F-NEXT:    testb $-128, %cl
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    cmovneq %r8, %rsi
-; AVX512F-NEXT:    movq %r9, %r10
-; AVX512F-NEXT:    cmovneq %rdx, %r10
-; AVX512F-NEXT:    cmovneq %rax, %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    cmovneq %r9, %rdx
-; AVX512F-NEXT:    testb $64, %cl
-; AVX512F-NEXT:    movq %rdx, %rdi
-; AVX512F-NEXT:    cmovneq %r8, %rdi
-; AVX512F-NEXT:    cmovneq %r10, %r8
-; AVX512F-NEXT:    cmoveq %rsi, %rdx
-; AVX512F-NEXT:    cmovneq %rsi, %r10
-; AVX512F-NEXT:    movq %r10, %rsi
-; AVX512F-NEXT:    shldq %cl, %rdx, %rsi
-; AVX512F-NEXT:    movq %r8, %r9
-; AVX512F-NEXT:    shldq %cl, %r10, %r9
-; AVX512F-NEXT:    movq %rdi, %r10
-; AVX512F-NEXT:    shldq %cl, %r8, %r10
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
-; AVX512F-NEXT:    movq %rdx, 24(%rax)
-; AVX512F-NEXT:    movq %r10, 16(%rax)
-; AVX512F-NEXT:    movq %r9, 8(%rax)
-; AVX512F-NEXT:    movq %rsi, (%rax)
+; AVX512F-NEXT:    movzbl %dl, %ecx
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpexpandq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    valignq {{.*#+}} zmm4 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512F-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
+; AVX512F-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshl_rot_i256_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movq %rdx, %rcx
-; AVX512VL-NEXT:    movq 16(%rsi), %rdx
-; AVX512VL-NEXT:    movq (%rsi), %r8
-; AVX512VL-NEXT:    movq 8(%rsi), %rax
-; AVX512VL-NEXT:    movq 24(%rsi), %rsi
-; AVX512VL-NEXT:    testb $-128, %cl
-; AVX512VL-NEXT:    movq %rsi, %r9
-; AVX512VL-NEXT:    cmovneq %rax, %r9
-; AVX512VL-NEXT:    movq %r8, %r10
-; AVX512VL-NEXT:    cmovneq %rdx, %r10
-; AVX512VL-NEXT:    cmovneq %rsi, %rax
-; AVX512VL-NEXT:    cmovneq %r8, %rdx
-; AVX512VL-NEXT:    testb $64, %cl
-; AVX512VL-NEXT:    movq %rdx, %rsi
-; AVX512VL-NEXT:    cmovneq %rax, %rsi
-; AVX512VL-NEXT:    cmovneq %r10, %rax
-; AVX512VL-NEXT:    cmoveq %r9, %rdx
-; AVX512VL-NEXT:    cmovneq %r9, %r10
-; AVX512VL-NEXT:    movq %r10, %r8
-; AVX512VL-NEXT:    shldq %cl, %rdx, %r8
-; AVX512VL-NEXT:    movq %rax, %r9
-; AVX512VL-NEXT:    shldq %cl, %r10, %r9
-; AVX512VL-NEXT:    movq %rsi, %r10
-; AVX512VL-NEXT:    shldq %cl, %rax, %r10
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %rsi, %rdx
+; AVX512VL-NEXT:    movzbl %dl, %eax
+; AVX512VL-NEXT:    vpbroadcastq %rax, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %ecx
+; AVX512VL-NEXT:    shlxl %eax, %ecx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vpexpandq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm4 = zmm3[7,0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm4, %zmm4
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    vpsllq %xmm0, %zmm3, %zmm0
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq %rdx, 24(%rdi)
-; AVX512VL-NEXT:    movq %r10, 16(%rdi)
-; AVX512VL-NEXT:    movq %r9, 8(%rdi)
-; AVX512VL-NEXT:    movq %r8, (%rdi)
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, %ymm0
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm2, %ymm1
+; AVX512VL-NEXT:    vpor %ymm1, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshl_rot_i256_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    movq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq 16(%rsi), %rdx
-; AVX512VBMI-NEXT:    movq (%rsi), %r8
-; AVX512VBMI-NEXT:    movq 8(%rsi), %rax
-; AVX512VBMI-NEXT:    movq 24(%rsi), %rsi
-; AVX512VBMI-NEXT:    testb $-128, %cl
-; AVX512VBMI-NEXT:    movq %rsi, %r9
-; AVX512VBMI-NEXT:    cmovneq %rax, %r9
-; AVX512VBMI-NEXT:    movq %r8, %r10
-; AVX512VBMI-NEXT:    cmovneq %rdx, %r10
-; AVX512VBMI-NEXT:    cmovneq %rsi, %rax
-; AVX512VBMI-NEXT:    cmovneq %r8, %rdx
-; AVX512VBMI-NEXT:    testb $64, %cl
-; AVX512VBMI-NEXT:    movq %rdx, %rsi
-; AVX512VBMI-NEXT:    cmovneq %rax, %rsi
-; AVX512VBMI-NEXT:    cmovneq %r10, %rax
-; AVX512VBMI-NEXT:    cmoveq %r9, %rdx
-; AVX512VBMI-NEXT:    cmovneq %r9, %r10
-; AVX512VBMI-NEXT:    movq %r10, %r8
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %r8
-; AVX512VBMI-NEXT:    movq %rax, %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r10, %r9
-; AVX512VBMI-NEXT:    movq %rsi, %r10
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %r10
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %rsi, %rdx
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq %rdx, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, (%rdi)
+; AVX512VBMI-NEXT:    movzbl %dl, %ecx
+; AVX512VBMI-NEXT:    movl %ecx, %edx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    movl $-1, %edi
+; AVX512VBMI-NEXT:    shlxl %edx, %edi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpexpandq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm2[7],zmm0[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpshldvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, (%rax)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i256, ptr %p0
   %r = call i256 @llvm.fshl.i256(i256 %a0, i256 %a0, i256 %a2)
@@ -936,101 +900,67 @@ define i256 @fshr_rot_i256_load(ptr %p0, i256 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshr_rot_i256_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    movq %rdx, %rcx
-; AVX512F-NEXT:    movq 16(%rsi), %r8
-; AVX512F-NEXT:    movq (%rsi), %r9
-; AVX512F-NEXT:    movq 8(%rsi), %rdx
-; AVX512F-NEXT:    movq 24(%rsi), %rax
-; AVX512F-NEXT:    testb %cl, %cl
-; AVX512F-NEXT:    movq %rax, %r10
-; AVX512F-NEXT:    cmovnsq %rdx, %r10
-; AVX512F-NEXT:    movq %r9, %rsi
-; AVX512F-NEXT:    cmovnsq %r8, %rsi
-; AVX512F-NEXT:    cmovnsq %rax, %rdx
+; AVX512F-NEXT:    movzbl %dl, %eax
+; AVX512F-NEXT:    vmovq %rax, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512F-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %ecx
+; AVX512F-NEXT:    shlxl %eax, %ecx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512F-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm2, %ymm3, %ymm2
+; AVX512F-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7,0]
+; AVX512F-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX512F-NEXT:    vpsllq %xmm0, %ymm1, %ymm0
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    cmovnsq %r9, %r8
-; AVX512F-NEXT:    testb $64, %cl
-; AVX512F-NEXT:    movq %r8, %rdi
-; AVX512F-NEXT:    cmoveq %rdx, %rdi
-; AVX512F-NEXT:    cmoveq %rsi, %rdx
-; AVX512F-NEXT:    cmoveq %r10, %rsi
-; AVX512F-NEXT:    cmovneq %r10, %r8
-; AVX512F-NEXT:    movq %r8, %r9
-; AVX512F-NEXT:    shrdq %cl, %rsi, %r9
-; AVX512F-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rdi, %rdx
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shrdq %cl, %r8, %rdi
-; AVX512F-NEXT:    movq %rdi, 24(%rax)
-; AVX512F-NEXT:    movq %rdx, 16(%rax)
-; AVX512F-NEXT:    movq %rsi, 8(%rax)
-; AVX512F-NEXT:    movq %r9, (%rax)
+; AVX512F-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512F-NEXT:    vmovdqu %ymm0, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshr_rot_i256_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    movq %rdx, %rcx
-; AVX512VL-NEXT:    movq 16(%rsi), %rax
-; AVX512VL-NEXT:    movq (%rsi), %r8
-; AVX512VL-NEXT:    movq 8(%rsi), %rdx
-; AVX512VL-NEXT:    movq 24(%rsi), %r9
-; AVX512VL-NEXT:    testb %cl, %cl
-; AVX512VL-NEXT:    movq %r9, %r10
-; AVX512VL-NEXT:    cmovnsq %rdx, %r10
-; AVX512VL-NEXT:    movq %r8, %rsi
-; AVX512VL-NEXT:    cmovnsq %rax, %rsi
-; AVX512VL-NEXT:    cmovnsq %r9, %rdx
-; AVX512VL-NEXT:    cmovnsq %r8, %rax
-; AVX512VL-NEXT:    testb $64, %cl
-; AVX512VL-NEXT:    movq %rax, %r8
-; AVX512VL-NEXT:    cmoveq %rdx, %r8
-; AVX512VL-NEXT:    cmoveq %rsi, %rdx
-; AVX512VL-NEXT:    cmoveq %r10, %rsi
-; AVX512VL-NEXT:    cmovneq %r10, %rax
-; AVX512VL-NEXT:    movq %rax, %r9
-; AVX512VL-NEXT:    shrdq %cl, %rsi, %r9
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VL-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shrdq %cl, %rax, %r8
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq %r8, 24(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VL-NEXT:    movq %r9, (%rdi)
+; AVX512VL-NEXT:    movzbl %dl, %ecx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm0
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm1 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm1, %xmm0, %xmm2
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %ecx, %edx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vbroadcasti64x4 {{.*#+}} zmm3 = mem[0,1,2,3,0,1,2,3]
+; AVX512VL-NEXT:    vpcompressq %zmm3, %zmm3 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm2, %ymm3, %ymm2
+; AVX512VL-NEXT:    vpandn %xmm1, %xmm0, %xmm0
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm3[1,2,3,4,5,6,7,0]
+; AVX512VL-NEXT:    vpaddq %ymm1, %ymm1, %ymm1
+; AVX512VL-NEXT:    vpsllq %xmm0, %ymm1, %ymm0
+; AVX512VL-NEXT:    vpor %ymm2, %ymm0, %ymm0
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshr_rot_i256_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    movq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq 16(%rsi), %rax
-; AVX512VBMI-NEXT:    movq (%rsi), %r8
-; AVX512VBMI-NEXT:    movq 8(%rsi), %rdx
-; AVX512VBMI-NEXT:    movq 24(%rsi), %r9
-; AVX512VBMI-NEXT:    testb %cl, %cl
-; AVX512VBMI-NEXT:    movq %r9, %r10
-; AVX512VBMI-NEXT:    cmovnsq %rdx, %r10
-; AVX512VBMI-NEXT:    movq %r8, %rsi
-; AVX512VBMI-NEXT:    cmovnsq %rax, %rsi
-; AVX512VBMI-NEXT:    cmovnsq %r9, %rdx
-; AVX512VBMI-NEXT:    cmovnsq %r8, %rax
-; AVX512VBMI-NEXT:    testb $64, %cl
-; AVX512VBMI-NEXT:    movq %rax, %r8
-; AVX512VBMI-NEXT:    cmoveq %rdx, %r8
-; AVX512VBMI-NEXT:    cmoveq %rsi, %rdx
-; AVX512VBMI-NEXT:    cmoveq %r10, %rsi
-; AVX512VBMI-NEXT:    cmovneq %r10, %rax
-; AVX512VBMI-NEXT:    movq %rax, %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r9
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %rsi
-; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rdx
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shrdq %cl, %rax, %r8
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq %r8, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r9, (%rdi)
+; AVX512VBMI-NEXT:    movzbl %dl, %ecx
+; AVX512VBMI-NEXT:    movl %ecx, %edx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    movl $-1, %edi
+; AVX512VBMI-NEXT:    shlxl %edx, %edi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vbroadcasti64x4 {{.*#+}} zmm0 = mem[0,1,2,3,0,1,2,3]
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT:    vpxor %xmm2, %xmm2, %xmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm2 = zmm0[1,2,3,4,5,6,7],zmm2[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm2, %zmm0
+; AVX512VBMI-NEXT:    vmovdqu %ymm0, (%rax)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i256, ptr %p0
   %r = call i256 @llvm.fshr.i256(i256 %a0, i256 %a0, i256 %a2)
diff --git a/llvm/test/CodeGen/X86/funnel-shift-i512.ll b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
index c6105d2170cf1..9edd8bb7c2f12 100644
--- a/llvm/test/CodeGen/X86/funnel-shift-i512.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift-i512.ll
@@ -1837,161 +1837,127 @@ define i512 @fshl_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshl_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $136, %rsp
-; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512F-NEXT:    vmovups (%rdx), %zmm1
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovups %zmm1, (%rsp)
-; AVX512F-NEXT:    vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    vmovdqu64 %zmm0, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT:    vmovdqu64 (%rdx), %zmm2
+; AVX512F-NEXT:    andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm3
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT:    vpand %xmm0, %xmm3, %xmm4
 ; AVX512F-NEXT:    movl %ecx, %edx
-; AVX512F-NEXT:    andl $511, %edx # imm = 0x1FF
-; AVX512F-NEXT:    movl $512, %esi # imm = 0x200
-; AVX512F-NEXT:    subq %rdx, %rsi
-; AVX512F-NEXT:    movl %esi, %edi
-; AVX512F-NEXT:    andl $63, %edi
-; AVX512F-NEXT:    vmovq %rdi, %xmm1
-; AVX512F-NEXT:    vpbroadcastq %xmm1, %xmm1
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    vmovdqu64 (%rsp,%rsi), %zmm3
-; AVX512F-NEXT:    vpsrlq %xmm1, %zmm3, %zmm4
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [63,63]
-; AVX512F-NEXT:    vpandn %xmm5, %xmm1, %xmm1
-; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm3[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT:    vpaddq %zmm3, %zmm3, %zmm3
-; AVX512F-NEXT:    vpsllq %xmm1, %zmm3, %zmm1
-; AVX512F-NEXT:    vporq %zmm4, %zmm1, %zmm1
-; AVX512F-NEXT:    movl %ecx, %esi
-; AVX512F-NEXT:    andl $63, %esi
-; AVX512F-NEXT:    vmovq %rsi, %xmm3
-; AVX512F-NEXT:    vpbroadcastq %xmm3, %xmm3
-; AVX512F-NEXT:    vpandn %xmm5, %xmm3, %xmm4
-; AVX512F-NEXT:    shrl $3, %ecx
-; AVX512F-NEXT:    andl $56, %ecx
-; AVX512F-NEXT:    leaq -{{[0-9]+}}(%rsp), %rsi
-; AVX512F-NEXT:    subq %rcx, %rsi
-; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm5
-; AVX512F-NEXT:    valignq {{.*#+}} zmm2 = zmm2[7],zmm5[0,1,2,3,4,5,6]
-; AVX512F-NEXT:    vpsrlq $1, %zmm2, %zmm2
-; AVX512F-NEXT:    vpsrlq %xmm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpsllq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT:    xorl %ecx, %ecx
-; AVX512F-NEXT:    negq %rdx
-; AVX512F-NEXT:    sbbl %ecx, %ecx
-; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
-; AVX512F-NEXT:    kmovw %ecx, %k1
-; AVX512F-NEXT:    vporq %zmm1, %zmm2, %zmm0 {%k1}
-; AVX512F-NEXT:    vmovdqu64 %zmm0, (%rax)
-; AVX512F-NEXT:    addq $136, %rsp
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    movl $-1, %esi
+; AVX512F-NEXT:    shlxl %edx, %esi, %edx
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    vpexpandq %zmm1, %zmm5 {%k1} {z}
+; AVX512F-NEXT:    vpsllq %xmm4, %zmm5, %zmm4
+; AVX512F-NEXT:    vpandn %xmm0, %xmm3, %xmm3
+; AVX512F-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512F-NEXT:    valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm5, %zmm5
+; AVX512F-NEXT:    vpsrlq %xmm3, %zmm5, %zmm3
+; AVX512F-NEXT:    vporq %zmm3, %zmm4, %zmm3
+; AVX512F-NEXT:    movl $512, %edx # imm = 0x200
+; AVX512F-NEXT:    subq %rcx, %rdx
+; AVX512F-NEXT:    vmovq %rdx, %xmm4
+; AVX512F-NEXT:    vpbroadcastq %xmm4, %xmm4
+; AVX512F-NEXT:    vpand %xmm0, %xmm4, %xmm5
+; AVX512F-NEXT:    shrl $6, %edx
+; AVX512F-NEXT:    shlxl %edx, %esi, %edx
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    vpcompressq %zmm2, %zmm2 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm5, %zmm2, %zmm5
+; AVX512F-NEXT:    vpandn %xmm0, %xmm4, %xmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7],zmm6[0]
+; AVX512F-NEXT:    vpaddq %zmm2, %zmm2, %zmm2
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm2, %zmm0
+; AVX512F-NEXT:    xorl %edx, %edx
+; AVX512F-NEXT:    negq %rcx
+; AVX512F-NEXT:    sbbl %edx, %edx
+; AVX512F-NEXT:    vporq %zmm5, %zmm0, %zmm0
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    vporq %zmm0, %zmm3, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshl_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    subq $136, %rsp
 ; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm1
-; AVX512VL-NEXT:    vmovups (%rdx), %ymm2
-; AVX512VL-NEXT:    vmovups 32(%rdx), %ymm3
-; AVX512VL-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm3, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, (%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    movl %ecx, %edx
-; AVX512VL-NEXT:    andl $511, %edx # imm = 0x1FF
-; AVX512VL-NEXT:    movl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    subq %rdx, %rax
-; AVX512VL-NEXT:    movl %eax, %esi
-; AVX512VL-NEXT:    andl $63, %esi
-; AVX512VL-NEXT:    vpbroadcastq %rsi, %xmm1
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    vmovdqu64 (%rsp,%rax), %zmm2
-; AVX512VL-NEXT:    vpsrlq %xmm1, %zmm2, %zmm3
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [63,63]
-; AVX512VL-NEXT:    vpandn %xmm4, %xmm1, %xmm1
-; AVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm2 = zmm2[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT:    vpaddq %zmm2, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpsllq %xmm1, %zmm2, %zmm1
+; AVX512VL-NEXT:    vmovdqu64 (%rdx), %zmm1
+; AVX512VL-NEXT:    andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm2
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm3 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm3, %xmm2, %xmm4
+; AVX512VL-NEXT:    movl %ecx, %eax
+; AVX512VL-NEXT:    shrl $6, %eax
+; AVX512VL-NEXT:    movl $-1, %edx
+; AVX512VL-NEXT:    shlxl %eax, %edx, %eax
+; AVX512VL-NEXT:    kmovd %eax, %k1
+; AVX512VL-NEXT:    vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VL-NEXT:    vpsllq %xmm4, %zmm5, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm3, %xmm2, %xmm2
+; AVX512VL-NEXT:    vpxor %xmm6, %xmm6, %xmm6
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm5 = zmm6[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm5, %zmm5
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm5, %zmm2
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    vporq %zmm3, %zmm1, %zmm1
-; AVX512VL-NEXT:    movl %ecx, %esi
-; AVX512VL-NEXT:    andl $63, %esi
-; AVX512VL-NEXT:    vpbroadcastq %rsi, %xmm2
-; AVX512VL-NEXT:    vpandn %xmm4, %xmm2, %xmm3
-; AVX512VL-NEXT:    shrl $3, %ecx
-; AVX512VL-NEXT:    andl $56, %ecx
-; AVX512VL-NEXT:    leaq -{{[0-9]+}}(%rsp), %rsi
+; AVX512VL-NEXT:    vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    movl $512, %esi # imm = 0x200
 ; AVX512VL-NEXT:    subq %rcx, %rsi
-; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm4
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm5 = zmm5[7],zmm4[0,1,2,3,4,5,6]
-; AVX512VL-NEXT:    vpsrlq $1, %zmm5, %zmm5
-; AVX512VL-NEXT:    vpsrlq %xmm3, %zmm5, %zmm3
-; AVX512VL-NEXT:    vpsllq %xmm2, %zmm4, %zmm2
-; AVX512VL-NEXT:    vporq %zmm3, %zmm2, %zmm2
-; AVX512VL-NEXT:    xorl %ecx, %ecx
-; AVX512VL-NEXT:    negq %rdx
-; AVX512VL-NEXT:    sbbl %ecx, %ecx
-; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpbroadcastq %rsi, %xmm4
+; AVX512VL-NEXT:    vpand %xmm3, %xmm4, %xmm5
+; AVX512VL-NEXT:    shrl $6, %esi
+; AVX512VL-NEXT:    shlxl %esi, %edx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm5, %zmm1, %zmm5
+; AVX512VL-NEXT:    vpandn %xmm3, %xmm4, %xmm3
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm1 = zmm1[1,2,3,4,5,6,7],zmm6[0]
+; AVX512VL-NEXT:    vpaddq %zmm1, %zmm1, %zmm1
+; AVX512VL-NEXT:    vpsllq %xmm3, %zmm1, %zmm1
+; AVX512VL-NEXT:    vporq %zmm5, %zmm1, %zmm1
+; AVX512VL-NEXT:    xorl %edx, %edx
+; AVX512VL-NEXT:    negq %rcx
+; AVX512VL-NEXT:    sbbl %edx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
 ; AVX512VL-NEXT:    vporq %zmm1, %zmm2, %zmm0 {%k1}
 ; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
-; AVX512VL-NEXT:    addq $136, %rsp
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshl_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    subq $136, %rsp
-; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm1
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    vmovups (%rdx), %ymm2
-; AVX512VBMI-NEXT:    vmovups 32(%rdx), %ymm3
-; AVX512VBMI-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm3, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, (%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %ecx, %esi
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdx), %zmm1
+; AVX512VBMI-NEXT:    andl $511, %ecx # imm = 0x1FF
+; AVX512VBMI-NEXT:    movl $512, %edx # imm = 0x200
+; AVX512VBMI-NEXT:    subq %rcx, %rdx
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm2
+; AVX512VBMI-NEXT:    # kill: def $edx killed $edx killed $rdx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm1, %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm1[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpshrdvq %zmm2, %zmm4, %zmm1
 ; AVX512VBMI-NEXT:    movl %ecx, %edx
-; AVX512VBMI-NEXT:    andl $511, %edx # imm = 0x1FF
-; AVX512VBMI-NEXT:    movl $512, %edi # imm = 0x200
-; AVX512VBMI-NEXT:    subq %rdx, %rdi
-; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm1
-; AVX512VBMI-NEXT:    # kill: def $edi killed $edi killed $rdi def $rdi
-; AVX512VBMI-NEXT:    shrl $3, %edi
-; AVX512VBMI-NEXT:    andl $56, %edi
-; AVX512VBMI-NEXT:    vmovdqu64 (%rsp,%rdi), %zmm2
-; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm3
-; AVX512VBMI-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm5 = zmm2[1,2,3,4,5,6,7],zmm4[0]
-; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm5, %zmm2
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    leaq -{{[0-9]+}}(%rsp), %rcx
-; AVX512VBMI-NEXT:    subq %rsi, %rcx
-; AVX512VBMI-NEXT:    vmovdqu64 (%rcx), %zmm1
-; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm4[7],zmm1[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT:    vpshldvq %zmm3, %zmm4, %zmm1
-; AVX512VBMI-NEXT:    xorl %ecx, %ecx
-; AVX512VBMI-NEXT:    negq %rdx
-; AVX512VBMI-NEXT:    sbbl %ecx, %ecx
-; AVX512VBMI-NEXT:    kmovd %ecx, %k1
-; AVX512VBMI-NEXT:    vporq %zmm2, %zmm1, %zmm0 {%k1}
-; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
-; AVX512VBMI-NEXT:    addq $136, %rsp
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    shlxl %edx, %esi, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm3 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT:    vpshldvq %zmm4, %zmm3, %zmm2
+; AVX512VBMI-NEXT:    xorl %edx, %edx
+; AVX512VBMI-NEXT:    negq %rcx
+; AVX512VBMI-NEXT:    sbbl %edx, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rdi)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -2163,161 +2129,124 @@ define i512 @fshr_i512_load(ptr %p0, ptr %p1, i512 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshr_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    subq $136, %rsp
-; AVX512F-NEXT:    vmovups (%rsi), %zmm0
 ; AVX512F-NEXT:    vmovdqu64 (%rdx), %zmm1
-; AVX512F-NEXT:    vpxor %xmm2, %xmm2, %xmm2
-; AVX512F-NEXT:    vmovdqu64 %zmm2, (%rsp)
-; AVX512F-NEXT:    vmovups %zmm0, {{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    vmovdqu64 %zmm2, -{{[0-9]+}}(%rsp)
+; AVX512F-NEXT:    andl $511, %ecx # imm = 0x1FF
+; AVX512F-NEXT:    vmovq %rcx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT:    vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT:    movl %ecx, %eax
+; AVX512F-NEXT:    shrl $6, %eax
+; AVX512F-NEXT:    movl $-1, %edx
+; AVX512F-NEXT:    shlxl %eax, %edx, %eax
+; AVX512F-NEXT:    kmovw %eax, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm3, %zmm4, %zmm3
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    vmovdqu64 %zmm1, -{{[0-9]+}}(%rsp)
-; AVX512F-NEXT:    movl %ecx, %edx
-; AVX512F-NEXT:    andl $511, %edx # imm = 0x1FF
-; AVX512F-NEXT:    movl $512, %esi # imm = 0x200
-; AVX512F-NEXT:    subq %rdx, %rsi
-; AVX512F-NEXT:    movl %esi, %edi
-; AVX512F-NEXT:    andl $63, %edi
-; AVX512F-NEXT:    vmovq %rdi, %xmm0
-; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm0
-; AVX512F-NEXT:    shrl $3, %esi
-; AVX512F-NEXT:    andl $56, %esi
-; AVX512F-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512F-NEXT:    subq %rsi, %rdi
-; AVX512F-NEXT:    vmovdqu64 (%rdi), %zmm3
-; AVX512F-NEXT:    vpsllq %xmm0, %zmm3, %zmm4
-; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm5 = [63,63]
-; AVX512F-NEXT:    vpandn %xmm5, %xmm0, %xmm0
-; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm2[7],zmm3[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    movl $512, %edi # imm = 0x200
+; AVX512F-NEXT:    subq %rcx, %rdi
+; AVX512F-NEXT:    vmovq %rdi, %xmm3
+; AVX512F-NEXT:    vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT:    shrl $6, %edi
+; AVX512F-NEXT:    shlxl %edi, %edx, %edx
+; AVX512F-NEXT:    kmovw %edx, %k1
+; AVX512F-NEXT:    vpexpandq (%rsi), %zmm4 {%k1} {z}
+; AVX512F-NEXT:    vpand %xmm0, %xmm3, %xmm6
+; AVX512F-NEXT:    vpsllq %xmm6, %zmm4, %zmm6
+; AVX512F-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm5[7],zmm4[0,1,2,3,4,5,6]
 ; AVX512F-NEXT:    vpsrlq $1, %zmm3, %zmm3
 ; AVX512F-NEXT:    vpsrlq %xmm0, %zmm3, %zmm0
-; AVX512F-NEXT:    vporq %zmm0, %zmm4, %zmm0
-; AVX512F-NEXT:    movl %ecx, %esi
-; AVX512F-NEXT:    andl $63, %esi
-; AVX512F-NEXT:    vmovq %rsi, %xmm3
-; AVX512F-NEXT:    vpbroadcastq %xmm3, %xmm3
-; AVX512F-NEXT:    vpandn %xmm5, %xmm3, %xmm4
-; AVX512F-NEXT:    shrl $3, %ecx
-; AVX512F-NEXT:    andl $56, %ecx
-; AVX512F-NEXT:    vmovdqu64 -128(%rsp,%rcx), %zmm5
-; AVX512F-NEXT:    valignq {{.*#+}} zmm2 = zmm5[1,2,3,4,5,6,7],zmm2[0]
-; AVX512F-NEXT:    vpaddq %zmm2, %zmm2, %zmm2
-; AVX512F-NEXT:    vpsllq %xmm4, %zmm2, %zmm2
-; AVX512F-NEXT:    vpsrlq %xmm3, %zmm5, %zmm3
-; AVX512F-NEXT:    xorl %ecx, %ecx
-; AVX512F-NEXT:    negq %rdx
-; AVX512F-NEXT:    sbbl %ecx, %ecx
-; AVX512F-NEXT:    vporq %zmm3, %zmm2, %zmm2
-; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    xorl %edx, %edx
+; AVX512F-NEXT:    negq %rcx
+; AVX512F-NEXT:    sbbl %edx, %edx
+; AVX512F-NEXT:    vporq %zmm0, %zmm6, %zmm0
+; AVX512F-NEXT:    kmovw %edx, %k1
 ; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm1 {%k1}
 ; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rax)
-; AVX512F-NEXT:    addq $136, %rsp
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshr_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    subq $136, %rsp
-; AVX512VL-NEXT:    vmovups (%rsi), %ymm1
-; AVX512VL-NEXT:    vmovups 32(%rsi), %ymm2
+; AVX512VL-NEXT:    movq %rdi, %rax
 ; AVX512VL-NEXT:    vmovdqu64 (%rdx), %zmm0
-; AVX512VL-NEXT:    vmovups 32(%rdx), %ymm3
-; AVX512VL-NEXT:    vxorps %xmm4, %xmm4, %xmm4
-; AVX512VL-NEXT:    vmovups %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, (%rsp)
-; AVX512VL-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VL-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
+; AVX512VL-NEXT:    andl $511, %ecx # imm = 0x1FF
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
 ; AVX512VL-NEXT:    movl %ecx, %edx
-; AVX512VL-NEXT:    andl $511, %edx # imm = 0x1FF
-; AVX512VL-NEXT:    movl $512, %eax # imm = 0x200
-; AVX512VL-NEXT:    subq %rdx, %rax
-; AVX512VL-NEXT:    movl %eax, %esi
-; AVX512VL-NEXT:    andl $63, %esi
-; AVX512VL-NEXT:    vpbroadcastq %rsi, %xmm1
-; AVX512VL-NEXT:    shrl $3, %eax
-; AVX512VL-NEXT:    andl $56, %eax
-; AVX512VL-NEXT:    leaq {{[0-9]+}}(%rsp), %rsi
-; AVX512VL-NEXT:    subq %rax, %rsi
-; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm2
-; AVX512VL-NEXT:    vpsllq %xmm1, %zmm2, %zmm3
-; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm4 = [63,63]
-; AVX512VL-NEXT:    vpandn %xmm4, %xmm1, %xmm1
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    movl $-1, %edi
+; AVX512VL-NEXT:    shlxl %edx, %edi, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm1
 ; AVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm2 = zmm5[7],zmm2[0,1,2,3,4,5,6]
-; AVX512VL-NEXT:    vpsrlq $1, %zmm2, %zmm2
-; AVX512VL-NEXT:    vpsrlq %xmm1, %zmm2, %zmm1
-; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
-; AVX512VL-NEXT:    movl %ecx, %esi
-; AVX512VL-NEXT:    andl $63, %esi
-; AVX512VL-NEXT:    vpbroadcastq %rsi, %xmm2
-; AVX512VL-NEXT:    vpandn %xmm4, %xmm2, %xmm3
-; AVX512VL-NEXT:    shrl $3, %ecx
-; AVX512VL-NEXT:    andl $56, %ecx
-; AVX512VL-NEXT:    vmovdqu64 -128(%rsp,%rcx), %zmm4
-; AVX512VL-NEXT:    valignq {{.*#+}} zmm5 = zmm4[1,2,3,4,5,6,7],zmm5[0]
-; AVX512VL-NEXT:    vpaddq %zmm5, %zmm5, %zmm5
-; AVX512VL-NEXT:    vpsllq %xmm3, %zmm5, %zmm3
-; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm4, %zmm2
-; AVX512VL-NEXT:    vporq %zmm2, %zmm3, %zmm2
-; AVX512VL-NEXT:    xorl %ecx, %ecx
-; AVX512VL-NEXT:    negq %rdx
-; AVX512VL-NEXT:    sbbl %ecx, %ecx
-; AVX512VL-NEXT:    kmovd %ecx, %k1
-; AVX512VL-NEXT:    vporq %zmm2, %zmm1, %zmm0 {%k1}
-; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rdi)
-; AVX512VL-NEXT:    addq $136, %rsp
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm4, %zmm4
+; AVX512VL-NEXT:    vpsllq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT:    vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT:    movl $512, %edx # imm = 0x200
+; AVX512VL-NEXT:    subq %rcx, %rdx
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %xmm3
+; AVX512VL-NEXT:    vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT:    shrl $6, %edx
+; AVX512VL-NEXT:    shlxl %edx, %edi, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vpexpandq (%rsi), %zmm6 {%k1} {z}
+; AVX512VL-NEXT:    vpsllq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    xorl %edx, %edx
+; AVX512VL-NEXT:    negq %rcx
+; AVX512VL-NEXT:    sbbl %edx, %edx
+; AVX512VL-NEXT:    kmovd %edx, %k1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT:    vmovdqu64 %zmm0, (%rax)
 ; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshr_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    subq $136, %rsp
-; AVX512VBMI-NEXT:    vmovups (%rsi), %ymm1
-; AVX512VBMI-NEXT:    vmovups 32(%rsi), %ymm2
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    vmovdqu64 (%rdx), %zmm0
-; AVX512VBMI-NEXT:    vmovups 32(%rdx), %ymm3
-; AVX512VBMI-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, (%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm2, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm1, {{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm4, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovups %ymm3, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    vmovdqu %ymm0, -{{[0-9]+}}(%rsp)
-; AVX512VBMI-NEXT:    movl %ecx, %edx
-; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm1
-; AVX512VBMI-NEXT:    # kill: def $ecx killed $ecx killed $rcx def $rcx
 ; AVX512VBMI-NEXT:    andl $511, %ecx # imm = 0x1FF
-; AVX512VBMI-NEXT:    movl $512, %esi # imm = 0x200
-; AVX512VBMI-NEXT:    subq %rcx, %rsi
-; AVX512VBMI-NEXT:    vpbroadcastq %rsi, %zmm2
-; AVX512VBMI-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
-; AVX512VBMI-NEXT:    shrl $3, %esi
-; AVX512VBMI-NEXT:    andl $56, %esi
-; AVX512VBMI-NEXT:    leaq {{[0-9]+}}(%rsp), %rdi
-; AVX512VBMI-NEXT:    subq %rsi, %rdi
-; AVX512VBMI-NEXT:    vmovdqu64 (%rdi), %zmm3
-; AVX512VBMI-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm5 = zmm4[7],zmm3[0,1,2,3,4,5,6]
-; AVX512VBMI-NEXT:    vpshldvq %zmm2, %zmm5, %zmm3
-; AVX512VBMI-NEXT:    shrl $3, %edx
-; AVX512VBMI-NEXT:    andl $56, %edx
-; AVX512VBMI-NEXT:    vmovdqu64 -128(%rsp,%rdx), %zmm2
-; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm4[0]
-; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT:    movl $512, %edi # imm = 0x200
+; AVX512VBMI-NEXT:    subq %rcx, %rdi
+; AVX512VBMI-NEXT:    vpbroadcastq %rdi, %zmm0
+; AVX512VBMI-NEXT:    # kill: def $edi killed $edi killed $rdi
+; AVX512VBMI-NEXT:    shrl $6, %edi
+; AVX512VBMI-NEXT:    movl $-1, %r8d
+; AVX512VBMI-NEXT:    shlxl %edi, %r8d, %edi
+; AVX512VBMI-NEXT:    kmovd %edi, %k1
+; AVX512VBMI-NEXT:    vpexpandq (%rsi), %zmm1 {%k1} {z}
+; AVX512VBMI-NEXT:    vmovdqu64 (%rdx), %zmm2
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm3[7],zmm1[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpshldvq %zmm0, %zmm4, %zmm1
+; AVX512VBMI-NEXT:    movl %ecx, %edx
+; AVX512VBMI-NEXT:    shrl $6, %edx
+; AVX512VBMI-NEXT:    shlxl %edx, %r8d, %edx
+; AVX512VBMI-NEXT:    kmovd %edx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm2, %zmm0 {%k1} {z}
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm3 = zmm0[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm4
+; AVX512VBMI-NEXT:    vpshrdvq %zmm4, %zmm3, %zmm0
 ; AVX512VBMI-NEXT:    xorl %edx, %edx
 ; AVX512VBMI-NEXT:    negq %rcx
 ; AVX512VBMI-NEXT:    sbbl %edx, %edx
 ; AVX512VBMI-NEXT:    kmovd %edx, %k1
-; AVX512VBMI-NEXT:    vporq %zmm2, %zmm3, %zmm0 {%k1}
-; AVX512VBMI-NEXT:    vmovdqu64 %zmm0, (%rax)
-; AVX512VBMI-NEXT:    addq $136, %rsp
+; AVX512VBMI-NEXT:    vporq %zmm0, %zmm1, %zmm2 {%k1}
+; AVX512VBMI-NEXT:    vmovdqu64 %zmm2, (%rax)
 ; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
@@ -2493,248 +2422,127 @@ define i512 @fshl_rot_i512_load(ptr %p0, i512 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshl_rot_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq %rdx, %rcx
-; AVX512F-NEXT:    movq 40(%rsi), %r8
-; AVX512F-NEXT:    movq 48(%rsi), %rdx
-; AVX512F-NEXT:    movq 16(%rsi), %rax
-; AVX512F-NEXT:    movq 56(%rsi), %r11
-; AVX512F-NEXT:    movq 24(%rsi), %rbx
-; AVX512F-NEXT:    movq (%rsi), %r10
-; AVX512F-NEXT:    movq 8(%rsi), %r14
-; AVX512F-NEXT:    movq 32(%rsi), %r12
-; AVX512F-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT:    movq %r12, %r15
-; AVX512F-NEXT:    cmovneq %r10, %r15
-; AVX512F-NEXT:    movq %rbx, %r13
-; AVX512F-NEXT:    cmovneq %r11, %r13
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    cmovneq %rdx, %rsi
-; AVX512F-NEXT:    movq %r14, %r9
-; AVX512F-NEXT:    cmovneq %r8, %r9
-; AVX512F-NEXT:    cmovneq %r12, %r10
-; AVX512F-NEXT:    cmovneq %rax, %rdx
-; AVX512F-NEXT:    cmovneq %r14, %r8
-; AVX512F-NEXT:    cmovneq %rbx, %r11
-; AVX512F-NEXT:    testb $-128, %cl
-; AVX512F-NEXT:    movq %r9, %r14
-; AVX512F-NEXT:    cmovneq %r11, %r14
-; AVX512F-NEXT:    cmovneq %r8, %r11
-; AVX512F-NEXT:    movq %rsi, %rbx
-; AVX512F-NEXT:    cmovneq %r10, %rbx
-; AVX512F-NEXT:    cmovneq %rdx, %r10
-; AVX512F-NEXT:    cmoveq %r13, %r9
-; AVX512F-NEXT:    cmoveq %r15, %rsi
-; AVX512F-NEXT:    cmovneq %r13, %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    cmovneq %r15, %rdx
-; AVX512F-NEXT:    testb $64, %cl
-; AVX512F-NEXT:    movq %rdx, %rdi
-; AVX512F-NEXT:    cmovneq %r8, %rdi
-; AVX512F-NEXT:    cmovneq %rsi, %r8
-; AVX512F-NEXT:    cmovneq %r9, %rsi
-; AVX512F-NEXT:    cmovneq %rbx, %r9
-; AVX512F-NEXT:    cmovneq %r14, %rbx
-; AVX512F-NEXT:    cmovneq %r10, %r14
-; AVX512F-NEXT:    cmoveq %r11, %rdx
-; AVX512F-NEXT:    cmovneq %r11, %r10
-; AVX512F-NEXT:    movq %r10, %r11
-; AVX512F-NEXT:    shldq %cl, %rdx, %r11
-; AVX512F-NEXT:    movq %r14, %r15
-; AVX512F-NEXT:    shldq %cl, %r10, %r15
-; AVX512F-NEXT:    movq %rbx, %r10
-; AVX512F-NEXT:    shldq %cl, %r14, %r10
-; AVX512F-NEXT:    movq %r9, %r14
-; AVX512F-NEXT:    shldq %cl, %rbx, %r14
-; AVX512F-NEXT:    movq %rsi, %rbx
-; AVX512F-NEXT:    shldq %cl, %r9, %rbx
-; AVX512F-NEXT:    movq %r8, %r9
-; AVX512F-NEXT:    shldq %cl, %rsi, %r9
-; AVX512F-NEXT:    movq %rdi, %rsi
-; AVX512F-NEXT:    shldq %cl, %r8, %rsi
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shldq %cl, %rdi, %rdx
-; AVX512F-NEXT:    movq %rdx, 56(%rax)
-; AVX512F-NEXT:    movq %rsi, 48(%rax)
-; AVX512F-NEXT:    movq %r9, 40(%rax)
-; AVX512F-NEXT:    movq %rbx, 32(%rax)
-; AVX512F-NEXT:    movq %r14, 24(%rax)
-; AVX512F-NEXT:    movq %r10, 16(%rax)
-; AVX512F-NEXT:    movq %r15, 8(%rax)
-; AVX512F-NEXT:    movq %r11, (%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT:    vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT:    movl %edx, %ecx
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    movl $-1, %esi
+; AVX512F-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpexpandq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT:    vpsllq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsrlq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vporq %zmm2, %zmm3, %zmm2
+; AVX512F-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT:    subq %rdx, %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm3
+; AVX512F-NEXT:    vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT:    vpand %xmm0, %xmm3, %xmm4
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm6 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT:    vpaddq %zmm3, %zmm3, %zmm3
+; AVX512F-NEXT:    vpsllq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    xorl %ecx, %ecx
+; AVX512F-NEXT:    negq %rdx
+; AVX512F-NEXT:    sbbl %ecx, %ecx
+; AVX512F-NEXT:    vporq %zmm4, %zmm0, %zmm0
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vporq %zmm0, %zmm2, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshl_rot_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq %rdx, %rcx
-; AVX512VL-NEXT:    movq 40(%rsi), %r8
-; AVX512VL-NEXT:    movq 48(%rsi), %rdx
-; AVX512VL-NEXT:    movq 16(%rsi), %rbx
-; AVX512VL-NEXT:    movq 56(%rsi), %r10
-; AVX512VL-NEXT:    movq 24(%rsi), %r14
-; AVX512VL-NEXT:    movq (%rsi), %r9
-; AVX512VL-NEXT:    movq 8(%rsi), %r15
-; AVX512VL-NEXT:    movq 32(%rsi), %r12
-; AVX512VL-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT:    movq %r12, %r11
-; AVX512VL-NEXT:    cmovneq %r9, %r11
-; AVX512VL-NEXT:    movq %r14, %r13
-; AVX512VL-NEXT:    cmovneq %r10, %r13
-; AVX512VL-NEXT:    movq %rbx, %rax
-; AVX512VL-NEXT:    cmovneq %rdx, %rax
-; AVX512VL-NEXT:    movq %r15, %rsi
-; AVX512VL-NEXT:    cmovneq %r8, %rsi
-; AVX512VL-NEXT:    cmovneq %r12, %r9
-; AVX512VL-NEXT:    cmovneq %rbx, %rdx
-; AVX512VL-NEXT:    cmovneq %r15, %r8
-; AVX512VL-NEXT:    cmovneq %r14, %r10
-; AVX512VL-NEXT:    testb $-128, %cl
-; AVX512VL-NEXT:    movq %rsi, %r14
-; AVX512VL-NEXT:    cmovneq %r10, %r14
-; AVX512VL-NEXT:    cmovneq %r8, %r10
-; AVX512VL-NEXT:    movq %rax, %rbx
-; AVX512VL-NEXT:    cmovneq %r9, %rbx
-; AVX512VL-NEXT:    cmovneq %rdx, %r9
-; AVX512VL-NEXT:    cmoveq %r13, %rsi
-; AVX512VL-NEXT:    cmoveq %r11, %rax
-; AVX512VL-NEXT:    cmovneq %r13, %r8
-; AVX512VL-NEXT:    cmovneq %r11, %rdx
-; AVX512VL-NEXT:    testb $64, %cl
-; AVX512VL-NEXT:    movq %rdx, %r11
-; AVX512VL-NEXT:    cmovneq %r8, %r11
-; AVX512VL-NEXT:    cmovneq %rax, %r8
-; AVX512VL-NEXT:    cmovneq %rsi, %rax
-; AVX512VL-NEXT:    cmovneq %rbx, %rsi
-; AVX512VL-NEXT:    cmovneq %r14, %rbx
-; AVX512VL-NEXT:    cmovneq %r9, %r14
-; AVX512VL-NEXT:    cmoveq %r10, %rdx
-; AVX512VL-NEXT:    cmovneq %r10, %r9
-; AVX512VL-NEXT:    movq %r9, %r10
-; AVX512VL-NEXT:    shldq %cl, %rdx, %r10
-; AVX512VL-NEXT:    movq %r14, %r15
-; AVX512VL-NEXT:    shldq %cl, %r9, %r15
-; AVX512VL-NEXT:    movq %rbx, %r9
-; AVX512VL-NEXT:    shldq %cl, %r14, %r9
-; AVX512VL-NEXT:    movq %rsi, %r14
-; AVX512VL-NEXT:    shldq %cl, %rbx, %r14
-; AVX512VL-NEXT:    movq %rax, %rbx
-; AVX512VL-NEXT:    shldq %cl, %rsi, %rbx
-; AVX512VL-NEXT:    movq %r8, %rsi
-; AVX512VL-NEXT:    shldq %cl, %rax, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    movq %r11, %rdi
-; AVX512VL-NEXT:    shldq %cl, %r8, %rdi
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VL-NEXT:    movq %rdx, 56(%rax)
-; AVX512VL-NEXT:    movq %rdi, 48(%rax)
-; AVX512VL-NEXT:    movq %rsi, 40(%rax)
-; AVX512VL-NEXT:    movq %rbx, 32(%rax)
-; AVX512VL-NEXT:    movq %r14, 24(%rax)
-; AVX512VL-NEXT:    movq %r9, 16(%rax)
-; AVX512VL-NEXT:    movq %r15, 8(%rax)
-; AVX512VL-NEXT:    movq %r10, (%rax)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    movl $-1, %esi
+; AVX512VL-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpexpandq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT:    vpsllq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm4 = zmm5[7],zmm4[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm4, %zmm4
+; AVX512VL-NEXT:    vpsrlq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm3, %zmm1
+; AVX512VL-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT:    subq %rdx, %rcx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm3 = zmm6[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT:    vpaddq %zmm3, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpsllq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vporq %zmm4, %zmm2, %zmm2
+; AVX512VL-NEXT:    xorl %ecx, %ecx
+; AVX512VL-NEXT:    negq %rdx
+; AVX512VL-NEXT:    sbbl %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vporq %zmm2, %zmm1, %zmm0 {%k1}
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshl_rot_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    movq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq 40(%rsi), %r8
-; AVX512VBMI-NEXT:    movq 48(%rsi), %rdx
-; AVX512VBMI-NEXT:    movq 16(%rsi), %rbx
-; AVX512VBMI-NEXT:    movq 56(%rsi), %r10
-; AVX512VBMI-NEXT:    movq 24(%rsi), %r14
-; AVX512VBMI-NEXT:    movq (%rsi), %r9
-; AVX512VBMI-NEXT:    movq 8(%rsi), %r15
-; AVX512VBMI-NEXT:    movq 32(%rsi), %r12
-; AVX512VBMI-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT:    movq %r12, %r11
-; AVX512VBMI-NEXT:    cmovneq %r9, %r11
-; AVX512VBMI-NEXT:    movq %r14, %r13
-; AVX512VBMI-NEXT:    cmovneq %r10, %r13
-; AVX512VBMI-NEXT:    movq %rbx, %rax
-; AVX512VBMI-NEXT:    cmovneq %rdx, %rax
-; AVX512VBMI-NEXT:    movq %r15, %rsi
-; AVX512VBMI-NEXT:    cmovneq %r8, %rsi
-; AVX512VBMI-NEXT:    cmovneq %r12, %r9
-; AVX512VBMI-NEXT:    cmovneq %rbx, %rdx
-; AVX512VBMI-NEXT:    cmovneq %r15, %r8
-; AVX512VBMI-NEXT:    cmovneq %r14, %r10
-; AVX512VBMI-NEXT:    testb $-128, %cl
-; AVX512VBMI-NEXT:    movq %rsi, %r14
-; AVX512VBMI-NEXT:    cmovneq %r10, %r14
-; AVX512VBMI-NEXT:    cmovneq %r8, %r10
-; AVX512VBMI-NEXT:    movq %rax, %rbx
-; AVX512VBMI-NEXT:    cmovneq %r9, %rbx
-; AVX512VBMI-NEXT:    cmovneq %rdx, %r9
-; AVX512VBMI-NEXT:    cmoveq %r13, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r11, %rax
-; AVX512VBMI-NEXT:    cmovneq %r13, %r8
-; AVX512VBMI-NEXT:    cmovneq %r11, %rdx
-; AVX512VBMI-NEXT:    testb $64, %cl
-; AVX512VBMI-NEXT:    movq %rdx, %r11
-; AVX512VBMI-NEXT:    cmovneq %r8, %r11
-; AVX512VBMI-NEXT:    cmovneq %rax, %r8
-; AVX512VBMI-NEXT:    cmovneq %rsi, %rax
-; AVX512VBMI-NEXT:    cmovneq %rbx, %rsi
-; AVX512VBMI-NEXT:    cmovneq %r14, %rbx
-; AVX512VBMI-NEXT:    cmovneq %r9, %r14
-; AVX512VBMI-NEXT:    cmoveq %r10, %rdx
-; AVX512VBMI-NEXT:    cmovneq %r10, %r9
-; AVX512VBMI-NEXT:    movq %r9, %r10
-; AVX512VBMI-NEXT:    shldq %cl, %rdx, %r10
-; AVX512VBMI-NEXT:    movq %r14, %r15
-; AVX512VBMI-NEXT:    shldq %cl, %r9, %r15
-; AVX512VBMI-NEXT:    movq %rbx, %r9
-; AVX512VBMI-NEXT:    shldq %cl, %r14, %r9
-; AVX512VBMI-NEXT:    movq %rsi, %r14
-; AVX512VBMI-NEXT:    shldq %cl, %rbx, %r14
-; AVX512VBMI-NEXT:    movq %rax, %rbx
-; AVX512VBMI-NEXT:    shldq %cl, %rsi, %rbx
-; AVX512VBMI-NEXT:    movq %r8, %rsi
-; AVX512VBMI-NEXT:    shldq %cl, %rax, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    movq %r11, %rdi
-; AVX512VBMI-NEXT:    shldq %cl, %r8, %rdi
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shldq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    movq %rdx, 56(%rax)
-; AVX512VBMI-NEXT:    movq %rdi, 48(%rax)
-; AVX512VBMI-NEXT:    movq %rsi, 40(%rax)
-; AVX512VBMI-NEXT:    movq %rbx, 32(%rax)
-; AVX512VBMI-NEXT:    movq %r14, 24(%rax)
-; AVX512VBMI-NEXT:    movq %r9, 16(%rax)
-; AVX512VBMI-NEXT:    movq %r15, 8(%rax)
-; AVX512VBMI-NEXT:    movq %r10, (%rax)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512VBMI-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT:    subq %rdx, %rcx
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT:    # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm2[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpexpandq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT:    vpshrdvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm1 = zmm3[7],zmm5[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT:    vpshldvq %zmm3, %zmm1, %zmm5
+; AVX512VBMI-NEXT:    xorl %ecx, %ecx
+; AVX512VBMI-NEXT:    negq %rdx
+; AVX512VBMI-NEXT:    sbbl %ecx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vporq %zmm2, %zmm5, %zmm0 {%k1}
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = call i512 @llvm.fshl.i512(i512 %a0, i512 %a0, i512 %a2)
@@ -2896,230 +2704,127 @@ define i512 @fshr_rot_i512_load(ptr %p0, i512 %a2) nounwind {
 ;
 ; AVX512F-LABEL: fshr_rot_i512_load:
 ; AVX512F:       # %bb.0:
-; AVX512F-NEXT:    pushq %r15
-; AVX512F-NEXT:    pushq %r14
-; AVX512F-NEXT:    pushq %r13
-; AVX512F-NEXT:    pushq %r12
-; AVX512F-NEXT:    pushq %rbx
-; AVX512F-NEXT:    movq %rdx, %rcx
-; AVX512F-NEXT:    movq 40(%rsi), %r8
-; AVX512F-NEXT:    movq 48(%rsi), %r9
-; AVX512F-NEXT:    movq 16(%rsi), %rax
-; AVX512F-NEXT:    movq 56(%rsi), %r14
-; AVX512F-NEXT:    movq 24(%rsi), %r11
-; AVX512F-NEXT:    movq (%rsi), %rdx
-; AVX512F-NEXT:    movq 8(%rsi), %rbx
-; AVX512F-NEXT:    movq 32(%rsi), %r12
-; AVX512F-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512F-NEXT:    movq %r12, %r15
-; AVX512F-NEXT:    cmoveq %rdx, %r15
-; AVX512F-NEXT:    movq %r11, %r13
-; AVX512F-NEXT:    cmoveq %r14, %r13
-; AVX512F-NEXT:    movq %rax, %rsi
-; AVX512F-NEXT:    cmoveq %r9, %rsi
-; AVX512F-NEXT:    movq %rbx, %r10
-; AVX512F-NEXT:    cmoveq %r8, %r10
-; AVX512F-NEXT:    cmoveq %r12, %rdx
-; AVX512F-NEXT:    cmoveq %rax, %r9
-; AVX512F-NEXT:    cmoveq %rbx, %r8
-; AVX512F-NEXT:    cmoveq %r11, %r14
-; AVX512F-NEXT:    testb $-128, %cl
-; AVX512F-NEXT:    movq %r10, %r11
-; AVX512F-NEXT:    cmoveq %r14, %r11
-; AVX512F-NEXT:    cmoveq %r8, %r14
-; AVX512F-NEXT:    movq %rsi, %rbx
-; AVX512F-NEXT:    cmoveq %rdx, %rbx
-; AVX512F-NEXT:    cmoveq %r9, %rdx
-; AVX512F-NEXT:    cmovneq %r13, %r10
-; AVX512F-NEXT:    cmovneq %r15, %rsi
-; AVX512F-NEXT:    cmoveq %r13, %r8
 ; AVX512F-NEXT:    movq %rdi, %rax
-; AVX512F-NEXT:    cmoveq %r15, %r9
-; AVX512F-NEXT:    testb $64, %cl
-; AVX512F-NEXT:    movq %r9, %rdi
-; AVX512F-NEXT:    cmoveq %r8, %rdi
-; AVX512F-NEXT:    cmoveq %rsi, %r8
-; AVX512F-NEXT:    cmoveq %r10, %rsi
-; AVX512F-NEXT:    cmoveq %rbx, %r10
-; AVX512F-NEXT:    cmoveq %r11, %rbx
-; AVX512F-NEXT:    cmoveq %rdx, %r11
-; AVX512F-NEXT:    cmoveq %r14, %rdx
-; AVX512F-NEXT:    cmovneq %r14, %r9
-; AVX512F-NEXT:    movq %r9, %r14
-; AVX512F-NEXT:    shrdq %cl, %rdx, %r14
-; AVX512F-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512F-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512F-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512F-NEXT:    shrdq %cl, %rsi, %r10
-; AVX512F-NEXT:    shrdq %cl, %r8, %rsi
-; AVX512F-NEXT:    shrdq %cl, %rdi, %r8
-; AVX512F-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512F-NEXT:    shrdq %cl, %r9, %rdi
-; AVX512F-NEXT:    movq %rdi, 56(%rax)
-; AVX512F-NEXT:    movq %r8, 48(%rax)
-; AVX512F-NEXT:    movq %rsi, 40(%rax)
-; AVX512F-NEXT:    movq %r10, 32(%rax)
-; AVX512F-NEXT:    movq %rbx, 24(%rax)
-; AVX512F-NEXT:    movq %r11, 16(%rax)
-; AVX512F-NEXT:    movq %rdx, 8(%rax)
-; AVX512F-NEXT:    movq %r14, (%rax)
-; AVX512F-NEXT:    popq %rbx
-; AVX512F-NEXT:    popq %r12
-; AVX512F-NEXT:    popq %r13
-; AVX512F-NEXT:    popq %r14
-; AVX512F-NEXT:    popq %r15
+; AVX512F-NEXT:    vmovdqu64 (%rsi), %zmm1
+; AVX512F-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512F-NEXT:    vmovq %rdx, %xmm0
+; AVX512F-NEXT:    vpbroadcastq %xmm0, %xmm2
+; AVX512F-NEXT:    vpbroadcastq {{.*#+}} xmm0 = [63,63]
+; AVX512F-NEXT:    vpand %xmm0, %xmm2, %xmm3
+; AVX512F-NEXT:    movl %edx, %ecx
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    movl $-1, %esi
+; AVX512F-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpcompressq %zmm1, %zmm4 {%k1} {z}
+; AVX512F-NEXT:    vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512F-NEXT:    vpandn %xmm0, %xmm2, %xmm2
+; AVX512F-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512F-NEXT:    valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512F-NEXT:    vpaddq %zmm4, %zmm4, %zmm4
+; AVX512F-NEXT:    vpsllq %xmm2, %zmm4, %zmm2
+; AVX512F-NEXT:    vporq %zmm3, %zmm2, %zmm2
+; AVX512F-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512F-NEXT:    subq %rdx, %rcx
+; AVX512F-NEXT:    vmovq %rcx, %xmm3
+; AVX512F-NEXT:    vpbroadcastq %xmm3, %xmm3
+; AVX512F-NEXT:    vpand %xmm0, %xmm3, %xmm4
+; AVX512F-NEXT:    shrl $6, %ecx
+; AVX512F-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vpexpandq %zmm1, %zmm6 {%k1} {z}
+; AVX512F-NEXT:    vpsllq %xmm4, %zmm6, %zmm4
+; AVX512F-NEXT:    vpandn %xmm0, %xmm3, %xmm0
+; AVX512F-NEXT:    valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512F-NEXT:    vpsrlq $1, %zmm3, %zmm3
+; AVX512F-NEXT:    vpsrlq %xmm0, %zmm3, %zmm0
+; AVX512F-NEXT:    xorl %ecx, %ecx
+; AVX512F-NEXT:    negq %rdx
+; AVX512F-NEXT:    sbbl %ecx, %ecx
+; AVX512F-NEXT:    vporq %zmm0, %zmm4, %zmm0
+; AVX512F-NEXT:    kmovw %ecx, %k1
+; AVX512F-NEXT:    vporq %zmm2, %zmm0, %zmm1 {%k1}
+; AVX512F-NEXT:    vmovdqu64 %zmm1, (%rdi)
 ; AVX512F-NEXT:    retq
 ;
 ; AVX512VL-LABEL: fshr_rot_i512_load:
 ; AVX512VL:       # %bb.0:
-; AVX512VL-NEXT:    pushq %r15
-; AVX512VL-NEXT:    pushq %r14
-; AVX512VL-NEXT:    pushq %r13
-; AVX512VL-NEXT:    pushq %r12
-; AVX512VL-NEXT:    pushq %rbx
-; AVX512VL-NEXT:    movq %rdx, %rcx
-; AVX512VL-NEXT:    movq 40(%rsi), %r8
-; AVX512VL-NEXT:    movq 48(%rsi), %r9
-; AVX512VL-NEXT:    movq 16(%rsi), %r11
-; AVX512VL-NEXT:    movq 56(%rsi), %rax
-; AVX512VL-NEXT:    movq 24(%rsi), %rbx
-; AVX512VL-NEXT:    movq (%rsi), %rdx
-; AVX512VL-NEXT:    movq 8(%rsi), %r15
-; AVX512VL-NEXT:    movq 32(%rsi), %r12
-; AVX512VL-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512VL-NEXT:    movq %r12, %r14
-; AVX512VL-NEXT:    cmoveq %rdx, %r14
-; AVX512VL-NEXT:    movq %rbx, %r13
-; AVX512VL-NEXT:    cmoveq %rax, %r13
-; AVX512VL-NEXT:    movq %r11, %rsi
-; AVX512VL-NEXT:    cmoveq %r9, %rsi
-; AVX512VL-NEXT:    movq %r15, %r10
-; AVX512VL-NEXT:    cmoveq %r8, %r10
-; AVX512VL-NEXT:    cmoveq %r12, %rdx
-; AVX512VL-NEXT:    cmoveq %r11, %r9
-; AVX512VL-NEXT:    cmoveq %r15, %r8
-; AVX512VL-NEXT:    cmoveq %rbx, %rax
-; AVX512VL-NEXT:    testb $-128, %cl
-; AVX512VL-NEXT:    movq %r10, %r11
-; AVX512VL-NEXT:    cmoveq %rax, %r11
-; AVX512VL-NEXT:    cmoveq %r8, %rax
-; AVX512VL-NEXT:    movq %rsi, %rbx
-; AVX512VL-NEXT:    cmoveq %rdx, %rbx
-; AVX512VL-NEXT:    cmoveq %r9, %rdx
-; AVX512VL-NEXT:    cmovneq %r13, %r10
-; AVX512VL-NEXT:    cmovneq %r14, %rsi
-; AVX512VL-NEXT:    cmoveq %r13, %r8
-; AVX512VL-NEXT:    cmoveq %r14, %r9
-; AVX512VL-NEXT:    testb $64, %cl
-; AVX512VL-NEXT:    movq %r9, %r14
-; AVX512VL-NEXT:    cmoveq %r8, %r14
-; AVX512VL-NEXT:    cmoveq %rsi, %r8
-; AVX512VL-NEXT:    cmoveq %r10, %rsi
-; AVX512VL-NEXT:    cmoveq %rbx, %r10
-; AVX512VL-NEXT:    cmoveq %r11, %rbx
-; AVX512VL-NEXT:    cmoveq %rdx, %r11
-; AVX512VL-NEXT:    cmoveq %rax, %rdx
-; AVX512VL-NEXT:    cmovneq %rax, %r9
-; AVX512VL-NEXT:    movq %r9, %r15
-; AVX512VL-NEXT:    shrdq %cl, %rdx, %r15
-; AVX512VL-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VL-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VL-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VL-NEXT:    shrdq %cl, %rsi, %r10
-; AVX512VL-NEXT:    shrdq %cl, %r8, %rsi
 ; AVX512VL-NEXT:    movq %rdi, %rax
-; AVX512VL-NEXT:    shrdq %cl, %r14, %r8
-; AVX512VL-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VL-NEXT:    shrdq %cl, %r9, %r14
-; AVX512VL-NEXT:    movq %r14, 56(%rdi)
-; AVX512VL-NEXT:    movq %r8, 48(%rdi)
-; AVX512VL-NEXT:    movq %rsi, 40(%rdi)
-; AVX512VL-NEXT:    movq %r10, 32(%rdi)
-; AVX512VL-NEXT:    movq %rbx, 24(%rdi)
-; AVX512VL-NEXT:    movq %r11, 16(%rdi)
-; AVX512VL-NEXT:    movq %rdx, 8(%rdi)
-; AVX512VL-NEXT:    movq %r15, (%rdi)
-; AVX512VL-NEXT:    popq %rbx
-; AVX512VL-NEXT:    popq %r12
-; AVX512VL-NEXT:    popq %r13
-; AVX512VL-NEXT:    popq %r14
-; AVX512VL-NEXT:    popq %r15
+; AVX512VL-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VL-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512VL-NEXT:    vpbroadcastq %rdx, %xmm1
+; AVX512VL-NEXT:    vpbroadcastq {{.*#+}} xmm2 = [63,63]
+; AVX512VL-NEXT:    vpand %xmm2, %xmm1, %xmm3
+; AVX512VL-NEXT:    movl %edx, %ecx
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    movl $-1, %esi
+; AVX512VL-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpcompressq %zmm0, %zmm4 {%k1} {z}
+; AVX512VL-NEXT:    vpsrlq %xmm3, %zmm4, %zmm3
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm1, %xmm1
+; AVX512VL-NEXT:    vpxor %xmm5, %xmm5, %xmm5
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm4 = zmm4[1,2,3,4,5,6,7],zmm5[0]
+; AVX512VL-NEXT:    vpaddq %zmm4, %zmm4, %zmm4
+; AVX512VL-NEXT:    vpsllq %xmm1, %zmm4, %zmm1
+; AVX512VL-NEXT:    vporq %zmm3, %zmm1, %zmm1
+; AVX512VL-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512VL-NEXT:    subq %rdx, %rcx
+; AVX512VL-NEXT:    vpbroadcastq %rcx, %xmm3
+; AVX512VL-NEXT:    vpand %xmm2, %xmm3, %xmm4
+; AVX512VL-NEXT:    shrl $6, %ecx
+; AVX512VL-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vpexpandq %zmm0, %zmm6 {%k1} {z}
+; AVX512VL-NEXT:    vpsllq %xmm4, %zmm6, %zmm4
+; AVX512VL-NEXT:    vpandn %xmm2, %xmm3, %xmm2
+; AVX512VL-NEXT:    valignq {{.*#+}} zmm3 = zmm5[7],zmm6[0,1,2,3,4,5,6]
+; AVX512VL-NEXT:    vpsrlq $1, %zmm3, %zmm3
+; AVX512VL-NEXT:    vpsrlq %xmm2, %zmm3, %zmm2
+; AVX512VL-NEXT:    vporq %zmm2, %zmm4, %zmm2
+; AVX512VL-NEXT:    xorl %ecx, %ecx
+; AVX512VL-NEXT:    negq %rdx
+; AVX512VL-NEXT:    sbbl %ecx, %ecx
+; AVX512VL-NEXT:    kmovd %ecx, %k1
+; AVX512VL-NEXT:    vporq %zmm1, %zmm2, %zmm0 {%k1}
+; AVX512VL-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VL-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VL-NEXT:    vzeroupper
 ; AVX512VL-NEXT:    retq
 ;
 ; AVX512VBMI-LABEL: fshr_rot_i512_load:
 ; AVX512VBMI:       # %bb.0:
-; AVX512VBMI-NEXT:    pushq %r15
-; AVX512VBMI-NEXT:    pushq %r14
-; AVX512VBMI-NEXT:    pushq %r13
-; AVX512VBMI-NEXT:    pushq %r12
-; AVX512VBMI-NEXT:    pushq %rbx
-; AVX512VBMI-NEXT:    movq %rdx, %rcx
-; AVX512VBMI-NEXT:    movq 40(%rsi), %r8
-; AVX512VBMI-NEXT:    movq 48(%rsi), %r9
-; AVX512VBMI-NEXT:    movq 16(%rsi), %r11
-; AVX512VBMI-NEXT:    movq 56(%rsi), %rax
-; AVX512VBMI-NEXT:    movq 24(%rsi), %rbx
-; AVX512VBMI-NEXT:    movq (%rsi), %rdx
-; AVX512VBMI-NEXT:    movq 8(%rsi), %r15
-; AVX512VBMI-NEXT:    movq 32(%rsi), %r12
-; AVX512VBMI-NEXT:    testl $256, %ecx # imm = 0x100
-; AVX512VBMI-NEXT:    movq %r12, %r14
-; AVX512VBMI-NEXT:    cmoveq %rdx, %r14
-; AVX512VBMI-NEXT:    movq %rbx, %r13
-; AVX512VBMI-NEXT:    cmoveq %rax, %r13
-; AVX512VBMI-NEXT:    movq %r11, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r9, %rsi
-; AVX512VBMI-NEXT:    movq %r15, %r10
-; AVX512VBMI-NEXT:    cmoveq %r8, %r10
-; AVX512VBMI-NEXT:    cmoveq %r12, %rdx
-; AVX512VBMI-NEXT:    cmoveq %r11, %r9
-; AVX512VBMI-NEXT:    cmoveq %r15, %r8
-; AVX512VBMI-NEXT:    cmoveq %rbx, %rax
-; AVX512VBMI-NEXT:    testb $-128, %cl
-; AVX512VBMI-NEXT:    movq %r10, %r11
-; AVX512VBMI-NEXT:    cmoveq %rax, %r11
-; AVX512VBMI-NEXT:    cmoveq %r8, %rax
-; AVX512VBMI-NEXT:    movq %rsi, %rbx
-; AVX512VBMI-NEXT:    cmoveq %rdx, %rbx
-; AVX512VBMI-NEXT:    cmoveq %r9, %rdx
-; AVX512VBMI-NEXT:    cmovneq %r13, %r10
-; AVX512VBMI-NEXT:    cmovneq %r14, %rsi
-; AVX512VBMI-NEXT:    cmoveq %r13, %r8
-; AVX512VBMI-NEXT:    cmoveq %r14, %r9
-; AVX512VBMI-NEXT:    testb $64, %cl
-; AVX512VBMI-NEXT:    movq %r9, %r14
-; AVX512VBMI-NEXT:    cmoveq %r8, %r14
-; AVX512VBMI-NEXT:    cmoveq %rsi, %r8
-; AVX512VBMI-NEXT:    cmoveq %r10, %rsi
-; AVX512VBMI-NEXT:    cmoveq %rbx, %r10
-; AVX512VBMI-NEXT:    cmoveq %r11, %rbx
-; AVX512VBMI-NEXT:    cmoveq %rdx, %r11
-; AVX512VBMI-NEXT:    cmoveq %rax, %rdx
-; AVX512VBMI-NEXT:    cmovneq %rax, %r9
-; AVX512VBMI-NEXT:    movq %r9, %r15
-; AVX512VBMI-NEXT:    shrdq %cl, %rdx, %r15
-; AVX512VBMI-NEXT:    shrdq %cl, %r11, %rdx
-; AVX512VBMI-NEXT:    shrdq %cl, %rbx, %r11
-; AVX512VBMI-NEXT:    shrdq %cl, %r10, %rbx
-; AVX512VBMI-NEXT:    shrdq %cl, %rsi, %r10
-; AVX512VBMI-NEXT:    shrdq %cl, %r8, %rsi
 ; AVX512VBMI-NEXT:    movq %rdi, %rax
-; AVX512VBMI-NEXT:    shrdq %cl, %r14, %r8
-; AVX512VBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
-; AVX512VBMI-NEXT:    shrdq %cl, %r9, %r14
-; AVX512VBMI-NEXT:    movq %r14, 56(%rdi)
-; AVX512VBMI-NEXT:    movq %r8, 48(%rdi)
-; AVX512VBMI-NEXT:    movq %rsi, 40(%rdi)
-; AVX512VBMI-NEXT:    movq %r10, 32(%rdi)
-; AVX512VBMI-NEXT:    movq %rbx, 24(%rdi)
-; AVX512VBMI-NEXT:    movq %r11, 16(%rdi)
-; AVX512VBMI-NEXT:    movq %rdx, 8(%rdi)
-; AVX512VBMI-NEXT:    movq %r15, (%rdi)
-; AVX512VBMI-NEXT:    popq %rbx
-; AVX512VBMI-NEXT:    popq %r12
-; AVX512VBMI-NEXT:    popq %r13
-; AVX512VBMI-NEXT:    popq %r14
-; AVX512VBMI-NEXT:    popq %r15
+; AVX512VBMI-NEXT:    vmovdqu64 (%rsi), %zmm0
+; AVX512VBMI-NEXT:    andl $511, %edx # imm = 0x1FF
+; AVX512VBMI-NEXT:    movl $512, %ecx # imm = 0x200
+; AVX512VBMI-NEXT:    subq %rdx, %rcx
+; AVX512VBMI-NEXT:    vpbroadcastq %rcx, %zmm1
+; AVX512VBMI-NEXT:    # kill: def $ecx killed $ecx killed $rcx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    movl $-1, %esi
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpexpandq %zmm0, %zmm2 {%k1} {z}
+; AVX512VBMI-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm4 = zmm3[7],zmm2[0,1,2,3,4,5,6]
+; AVX512VBMI-NEXT:    movl %edx, %ecx
+; AVX512VBMI-NEXT:    shrl $6, %ecx
+; AVX512VBMI-NEXT:    shlxl %ecx, %esi, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vpcompressq %zmm0, %zmm5 {%k1} {z}
+; AVX512VBMI-NEXT:    vpshldvq %zmm1, %zmm4, %zmm2
+; AVX512VBMI-NEXT:    valignq {{.*#+}} zmm1 = zmm5[1,2,3,4,5,6,7],zmm3[0]
+; AVX512VBMI-NEXT:    vpbroadcastq %rdx, %zmm3
+; AVX512VBMI-NEXT:    vpshrdvq %zmm3, %zmm1, %zmm5
+; AVX512VBMI-NEXT:    xorl %ecx, %ecx
+; AVX512VBMI-NEXT:    negq %rdx
+; AVX512VBMI-NEXT:    sbbl %ecx, %ecx
+; AVX512VBMI-NEXT:    kmovd %ecx, %k1
+; AVX512VBMI-NEXT:    vporq %zmm5, %zmm2, %zmm0 {%k1}
+; AVX512VBMI-NEXT:    vextracti64x4 $1, %zmm0, 32(%rdi)
+; AVX512VBMI-NEXT:    vmovdqu %ymm0, (%rdi)
+; AVX512VBMI-NEXT:    vzeroupper
 ; AVX512VBMI-NEXT:    retq
   %a0 = load i512, ptr %p0
   %r = call i512 @llvm.fshr.i512(i512 %a0, i512 %a0, i512 %a2)
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 41f4528bfdc65..e4a21fcebcbe2 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -1482,312 +1482,329 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    pushq %r13
 ; SSE4-NEXT:    pushq %r12
 ; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    subq $120, %rsp
 ; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE4-NEXT:    pand %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE4-NEXT:    pand %xmm0, %xmm5
+; SSE4-NEXT:    pshufd {{.*#+}} xmm6 = xmm5[2,3,2,3]
+; SSE4-NEXT:    movq %xmm6, %rax
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE4-NEXT:    pand %xmm0, %xmm6
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE4-NEXT:    pand %xmm0, %xmm7
+; SSE4-NEXT:    pshufd {{.*#+}} xmm8 = xmm7[2,3,2,3]
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm9
+; SSE4-NEXT:    pand %xmm0, %xmm9
+; SSE4-NEXT:    pshufd {{.*#+}} xmm10 = xmm9[2,3,2,3]
+; SSE4-NEXT:    movq %xmm10, %r13
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm10
+; SSE4-NEXT:    pand %xmm0, %xmm10
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT:    pand %xmm0, %xmm11
+; SSE4-NEXT:    pshufd {{.*#+}} xmm12 = xmm11[2,3,2,3]
+; SSE4-NEXT:    movq %xmm12, %rcx
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT:    pand %xmm0, %xmm13
+; SSE4-NEXT:    pshufd {{.*#+}} xmm12 = xmm13[2,3,2,3]
+; SSE4-NEXT:    movq %xmm12, %r14
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm12
+; SSE4-NEXT:    pand %xmm0, %xmm12
+; SSE4-NEXT:    pshufd {{.*#+}} xmm14 = xmm12[2,3,2,3]
+; SSE4-NEXT:    movq %xmm14, %rbp
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT:    pand %xmm0, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
+; SSE4-NEXT:    movq %xmm15, %r10
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT:    pand %xmm0, %xmm15
+; SSE4-NEXT:    movq %xmm12, %r11
+; SSE4-NEXT:    pshufd {{.*#+}} xmm12 = xmm15[2,3,2,3]
+; SSE4-NEXT:    movq %xmm14, %rbx
 ; SSE4-NEXT:    andl $127, %edx
 ; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; SSE4-NEXT:    andl $127, %r8d
 ; SSE4-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT:    andl $127, %r10d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    andl $127, %ecx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE4-NEXT:    andl $127, %r8d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT:    andl $127, %ebx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    andl $127, %edx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT:    andl $127, %r13d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    andl $127, %r11d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    andl $127, %r14d
 ; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
 ; SSE4-NEXT:    andl $127, %r12d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT:    cmpq %rax, %rbp
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    sbbq %r14, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rbp, %rax
-; SSE4-NEXT:    sbbq %r12, %r14
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq %r11, %r15
-; SSE4-NEXT:    sbbq %r13, %r15
+; SSE4-NEXT:    cmpq %r11, %rbx
+; SSE4-NEXT:    movq %r10, %r15
+; SSE4-NEXT:    sbbq %rbp, %r15
+; SSE4-NEXT:    movq %xmm12, %r15
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT:    pand %xmm0, %xmm14
+; SSE4-NEXT:    pshufd {{.*#+}} xmm12 = xmm14[2,3,2,3]
+; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    cmpq %rbx, %r11
+; SSE4-NEXT:    movq %xmm12, %r11
+; SSE4-NEXT:    pshufd {{.*#+}} xmm12 = xmm10[2,3,2,3]
+; SSE4-NEXT:    movq %xmm15, %rbx
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT:    sbbq %r10, %rbp
+; SSE4-NEXT:    pand %xmm0, %xmm15
+; SSE4-NEXT:    movq %xmm14, %r8
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    cmpq %rbx, %r8
+; SSE4-NEXT:    movq %r11, %r10
+; SSE4-NEXT:    sbbq %r15, %r10
+; SSE4-NEXT:    pshufd {{.*#+}} xmm14 = xmm15[2,3,2,3]
+; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    cmpq %r8, %rbx
+; SSE4-NEXT:    movq %xmm14, %r8
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT:    pand %xmm0, %xmm14
+; SSE4-NEXT:    sbbq %r11, %r15
+; SSE4-NEXT:    movq %xmm13, %r10
+; SSE4-NEXT:    movq %xmm15, %r11
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    cmpq %r10, %r11
+; SSE4-NEXT:    movq %r8, %rbx
+; SSE4-NEXT:    sbbq %r14, %rbx
+; SSE4-NEXT:    pshufd {{.*#+}} xmm13 = xmm14[2,3,2,3]
+; SSE4-NEXT:    movq %xmm13, %rbx
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm15
+; SSE4-NEXT:    pand %xmm0, %xmm15
+; SSE4-NEXT:    pshufd {{.*#+}} xmm13 = xmm15[2,3,2,3]
 ; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r11, %r13
+; SSE4-NEXT:    cmpq %r11, %r10
+; SSE4-NEXT:    movq %xmm13, %r10
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm13
+; SSE4-NEXT:    pand %xmm0, %xmm13
+; SSE4-NEXT:    movq %xmm14, %r11
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm14
+; SSE4-NEXT:    pand %xmm0, %xmm14
+; SSE4-NEXT:    sbbq %r8, %r14
+; SSE4-NEXT:    movq %xmm15, %rdx
+; SSE4-NEXT:    pshufd {{.*#+}} xmm15 = xmm14[2,3,2,3]
 ; SSE4-NEXT:    sbbb $0, %bpl
 ; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    cmpq %rax, %r11
-; SSE4-NEXT:    movq %rdx, %r14
-; SSE4-NEXT:    sbbq %rbx, %r14
+; SSE4-NEXT:    cmpq %r11, %rdx
+; SSE4-NEXT:    movq %r10, %r8
+; SSE4-NEXT:    sbbq %rbx, %r8
 ; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r11, %rax
-; SSE4-NEXT:    sbbq %rdx, %rbx
+; SSE4-NEXT:    cmpq %rdx, %r11
+; SSE4-NEXT:    movq %xmm15, %rdx
+; SSE4-NEXT:    sbbq %r10, %rbx
+; SSE4-NEXT:    movq %xmm11, %r8
+; SSE4-NEXT:    movq %xmm14, %r10
 ; SSE4-NEXT:    sbbb $0, %bpl
 ; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    cmpq %rax, %rdx
-; SSE4-NEXT:    movq %r8, %r11
+; SSE4-NEXT:    cmpq %r8, %r10
+; SSE4-NEXT:    movq %rdx, %r11
 ; SSE4-NEXT:    sbbq %rcx, %r11
-; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rdx, %rax
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE4-NEXT:    pshufd {{.*#+}} xmm11 = xmm13[2,3,2,3]
+; SSE4-NEXT:    movq %xmm11, %r11
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm11
+; SSE4-NEXT:    pand %xmm0, %xmm11
+; SSE4-NEXT:    pshufd {{.*#+}} xmm14 = xmm11[2,3,2,3]
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    cmpq %r10, %r8
+; SSE4-NEXT:    movq %xmm14, %r8
+; SSE4-NEXT:    movq %xmm13, %r10
+; SSE4-NEXT:    sbbq %rdx, %rcx
+; SSE4-NEXT:    movq %xmm11, %rcx
+; SSE4-NEXT:    sbbb $0, %bl
+; SSE4-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    cmpq %r10, %rcx
 ; SSE4-NEXT:    movq %r8, %rdx
-; SSE4-NEXT:    sbbq %r10, %rdx
-; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r8, %r10
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
+; SSE4-NEXT:    sbbq %r11, %rdx
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    cmpq %rcx, %r10
+; SSE4-NEXT:    movq %xmm12, %rcx
+; SSE4-NEXT:    movq %xmm9, %rdx
+; SSE4-NEXT:    sbbq %r8, %r11
+; SSE4-NEXT:    movq %xmm10, %r8
+; SSE4-NEXT:    sbbb $0, %bl
+; SSE4-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    cmpq %rdx, %r8
+; SSE4-NEXT:    movq %rcx, %r10
+; SSE4-NEXT:    sbbq %r13, %r10
+; SSE4-NEXT:    movq %xmm8, %r11
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE4-NEXT:    pand %xmm0, %xmm8
+; SSE4-NEXT:    pshufd {{.*#+}} xmm9 = xmm8[2,3,2,3]
 ; SSE4-NEXT:    setb %r10b
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    cmpq %r8, %rdx
+; SSE4-NEXT:    movq %xmm9, %rdx
+; SSE4-NEXT:    sbbq %rcx, %r13
+; SSE4-NEXT:    movq %xmm7, %rcx
+; SSE4-NEXT:    movq %xmm8, %r8
 ; SSE4-NEXT:    sbbb $0, %r10b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
+; SSE4-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    cmpq %rcx, %r8
+; SSE4-NEXT:    movq %rdx, %rbx
+; SSE4-NEXT:    sbbq %r11, %rbx
+; SSE4-NEXT:    pshufd {{.*#+}} xmm7 = xmm6[2,3,2,3]
+; SSE4-NEXT:    setb %r13b
+; SSE4-NEXT:    cmpq %r8, %rcx
+; SSE4-NEXT:    movq %xmm7, %r8
+; SSE4-NEXT:    movq %xmm5, %rbx
+; SSE4-NEXT:    sbbq %rdx, %r11
+; SSE4-NEXT:    movq %xmm6, %rdx
+; SSE4-NEXT:    sbbb $0, %r13b
+; SSE4-NEXT:    cmpq %rbx, %rdx
+; SSE4-NEXT:    movq %r8, %rcx
+; SSE4-NEXT:    sbbq %rax, %rcx
+; SSE4-NEXT:    movq %xmm4, %r11
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE4-NEXT:    pand %xmm0, %xmm4
+; SSE4-NEXT:    pshufd {{.*#+}} xmm5 = xmm4[2,3,2,3]
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %rdx, %rbx
+; SSE4-NEXT:    movq %xmm5, %rbx
+; SSE4-NEXT:    movq %xmm3, %r15
+; SSE4-NEXT:    sbbq %r8, %rax
+; SSE4-NEXT:    movq %xmm4, %rax
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    cmpq %r15, %rax
+; SSE4-NEXT:    movq %rbx, %rdx
+; SSE4-NEXT:    sbbq %r11, %rdx
 ; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
+; SSE4-NEXT:    cmpq %rax, %r15
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT:    pand %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    movq %xmm3, %r15
+; SSE4-NEXT:    sbbq %rbx, %r11
+; SSE4-NEXT:    movq %xmm1, %r11
+; SSE4-NEXT:    movq %xmm2, %rbx
 ; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %r8
-; SSE4-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT:    sbbq %rbx, %r8
+; SSE4-NEXT:    cmpq %r11, %rbx
+; SSE4-NEXT:    movq %r15, %r8
+; SSE4-NEXT:    sbbq %rax, %r8
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rbx, %r11
+; SSE4-NEXT:    movq %xmm2, %rbx
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT:    pand %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    movq %xmm3, %rbp
+; SSE4-NEXT:    sbbq %r15, %rax
+; SSE4-NEXT:    movq %xmm1, %rax
+; SSE4-NEXT:    movq %xmm2, %r15
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    cmpq %rax, %r15
+; SSE4-NEXT:    movq %rbp, %r11
+; SSE4-NEXT:    sbbq %rbx, %r11
 ; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %rbx
+; SSE4-NEXT:    cmpq %r15, %rax
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT:    pand %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm3 = xmm2[2,3,2,3]
+; SSE4-NEXT:    sbbq %rbp, %rbx
+; SSE4-NEXT:    movq %xmm1, %r15
+; SSE4-NEXT:    movq %xmm2, %rbp
 ; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    cmpq %r15, %rbp
+; SSE4-NEXT:    movq %xmm3, %r14
 ; SSE4-NEXT:    movq %r14, %rbx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rbx
+; SSE4-NEXT:    sbbq %rax, %rbx
 ; SSE4-NEXT:    setb %bl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %r8
+; SSE4-NEXT:    cmpq %rbp, %r15
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    sbbq %r14, %rax
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %xmm1, %r14
 ; SSE4-NEXT:    sbbb $0, %bl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE4-NEXT:    movq %r15, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    setb %cl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r15, %r8
-; SSE4-NEXT:    sbbb $0, %cl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT:    cmpq %rax, %r15
+; SSE4-NEXT:    cmpq %r9, %r14
+; SSE4-NEXT:    movq %rax, %r15
+; SSE4-NEXT:    sbbq %r12, %r15
+; SSE4-NEXT:    setb %bpl
+; SSE4-NEXT:    cmpq %r14, %r9
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    sbbq %rax, %r12
+; SSE4-NEXT:    movq %xmm2, %rax
+; SSE4-NEXT:    movq %xmm1, %r14
+; SSE4-NEXT:    sbbb $0, %bpl
 ; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r14
+; SSE4-NEXT:    cmpq %r12, %r14
+; SSE4-NEXT:    movq %rax, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    sbbq %rcx, %r9
+; SSE4-NEXT:    movq %rdi, %r15
+; SSE4-NEXT:    setb %r9b
+; SSE4-NEXT:    cmpq %r14, %r12
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE4-NEXT:    pand %xmm0, %xmm1
+; SSE4-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[2,3,2,3]
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE4-NEXT:    pand %xmm0, %xmm3
+; SSE4-NEXT:    pshufd {{.*#+}} xmm4 = xmm3[2,3,2,3]
+; SSE4-NEXT:    movq %xmm4, %r12
+; SSE4-NEXT:    sbbq %rax, %rcx
+; SSE4-NEXT:    movq %xmm3, %rax
+; SSE4-NEXT:    sbbb $0, %r9b
+; SSE4-NEXT:    cmpq %rsi, %rax
+; SSE4-NEXT:    movq %r12, %rdi
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; SSE4-NEXT:    sbbq %rcx, %rdi
+; SSE4-NEXT:    movq %xmm2, %rdi
+; SSE4-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE4-NEXT:    pand %xmm0, %xmm2
+; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[2,3,2,3]
 ; SSE4-NEXT:    setb %r14b
-; SSE4-NEXT:    cmpq %r15, %rax
-; SSE4-NEXT:    sbbq %r12, %r8
+; SSE4-NEXT:    cmpq %rax, %rsi
+; SSE4-NEXT:    movq %xmm0, %rsi
+; SSE4-NEXT:    sbbq %r12, %rcx
+; SSE4-NEXT:    movq %xmm1, %r12
+; SSE4-NEXT:    movq %xmm2, %rax
 ; SSE4-NEXT:    sbbb $0, %r14b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    cmpq %r9, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rax, %r9
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
 ; SSE4-NEXT:    cmpq %r12, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT:    movq %r13, %r9
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r9
-; SSE4-NEXT:    setb %r9b
+; SSE4-NEXT:    movq %rsi, %rcx
+; SSE4-NEXT:    sbbq %rdi, %rcx
+; SSE4-NEXT:    setb %cl
 ; SSE4-NEXT:    cmpq %rax, %r12
-; SSE4-NEXT:    sbbq %r13, %r8
-; SSE4-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    sbbb $0, %r9b
-; SSE4-NEXT:    cmpq %rsi, %r12
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rdi
-; SSE4-NEXT:    setb %dil
-; SSE4-NEXT:    cmpq %r12, %rsi
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT:    sbbb $0, %dil
-; SSE4-NEXT:    cmpq %r12, %r13
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rsi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rsi
-; SSE4-NEXT:    setb %sil
-; SSE4-NEXT:    cmpq %r13, %r12
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm1
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm2
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm3
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm4
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm5
-; SSE4-NEXT:    movzbl %r10b, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm6
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r10d, %xmm7
-; SSE4-NEXT:    movzbl %bpl, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm0
-; SSE4-NEXT:    movzbl %dl, %edx
-; SSE4-NEXT:    movd %edx, %xmm8
-; SSE4-NEXT:    movzbl %r11b, %edx
-; SSE4-NEXT:    movd %edx, %xmm9
-; SSE4-NEXT:    movzbl %bl, %edx
-; SSE4-NEXT:    movd %edx, %xmm10
-; SSE4-NEXT:    movzbl %cl, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm11
-; SSE4-NEXT:    movzbl %r14b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm12
-; SSE4-NEXT:    movzbl %r15b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm13
-; SSE4-NEXT:    movzbl %r9b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm14
-; SSE4-NEXT:    movzbl %dil, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm15
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm1
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm2
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm3
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm4
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm5
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm6
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE4-NEXT:    movd %eax, %xmm7
+; SSE4-NEXT:    movzbl %r13b, %eax
+; SSE4-NEXT:    movd %eax, %xmm0
+; SSE4-NEXT:    movzbl %r10b, %eax
+; SSE4-NEXT:    movd %eax, %xmm8
+; SSE4-NEXT:    movzbl %dl, %eax
+; SSE4-NEXT:    movd %eax, %xmm9
+; SSE4-NEXT:    movzbl %r8b, %eax
+; SSE4-NEXT:    movd %eax, %xmm10
+; SSE4-NEXT:    movzbl %r11b, %eax
+; SSE4-NEXT:    movd %eax, %xmm11
+; SSE4-NEXT:    movzbl %bl, %eax
+; SSE4-NEXT:    movd %eax, %xmm12
+; SSE4-NEXT:    movzbl %bpl, %eax
+; SSE4-NEXT:    movd %eax, %xmm13
+; SSE4-NEXT:    movzbl %r9b, %eax
+; SSE4-NEXT:    movd %eax, %xmm14
+; SSE4-NEXT:    movzbl %r14b, %eax
+; SSE4-NEXT:    movd %eax, %xmm15
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
@@ -1802,77 +1819,76 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE4-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
 ; SSE4-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    sbbb $0, %sil
+; SSE4-NEXT:    sbbq %rsi, %rdi
+; SSE4-NEXT:    sbbb $0, %cl
 ; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE4-NEXT:    movzbl %sil, %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    movb %cl, 4(%rax)
+; SSE4-NEXT:    movzbl %cl, %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    movb %al, 4(%r15)
 ; SSE4-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    leaq (%rcx,%rax,4), %rax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $4, %ecx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $6, %eax
+; SSE4-NEXT:    orq %rcx, %rax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $8, %ecx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $10, %eax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    leaq (%rdx,%rcx,4), %rcx
+; SSE4-NEXT:    shll $12, %edx
+; SSE4-NEXT:    orq %rax, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shll $14, %esi
+; SSE4-NEXT:    orq %rdx, %rsi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shll $16, %eax
+; SSE4-NEXT:    orq %rsi, %rax
+; SSE4-NEXT:    orq %rcx, %rax
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $18, %ecx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $4, %edx
+; SSE4-NEXT:    shll $20, %edx
 ; SSE4-NEXT:    orq %rcx, %rdx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $6, %ecx
+; SSE4-NEXT:    shll $22, %ecx
 ; SSE4-NEXT:    orq %rdx, %rcx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $8, %edx
+; SSE4-NEXT:    shll $24, %edx
 ; SSE4-NEXT:    orq %rcx, %rdx
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
 ; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $10, %ecx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $12, %esi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT:    andl $3, %edi
-; SSE4-NEXT:    shll $14, %edi
-; SSE4-NEXT:    orq %rsi, %rdi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $16, %ecx
-; SSE4-NEXT:    orq %rdi, %rcx
+; SSE4-NEXT:    shlq $26, %rcx
 ; SSE4-NEXT:    orq %rdx, %rcx
+; SSE4-NEXT:    orq %rax, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; SSE4-NEXT:    andl $3, %eax
+; SSE4-NEXT:    shlq $28, %rax
 ; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
 ; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $18, %edx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $20, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $22, %edx
-; SSE4-NEXT:    orq %rsi, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $24, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shlq $26, %rdx
-; SSE4-NEXT:    orq %rsi, %rdx
+; SSE4-NEXT:    shlq $30, %rdx
+; SSE4-NEXT:    orq %rax, %rdx
 ; SSE4-NEXT:    orq %rcx, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shlq $28, %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shlq $30, %rsi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movl %esi, (%rax)
-; SSE4-NEXT:    addq $120, %rsp
+; SSE4-NEXT:    movl %edx, (%r15)
+; SSE4-NEXT:    movq %r15, %rax
 ; SSE4-NEXT:    popq %rbx
 ; SSE4-NEXT:    popq %r12
 ; SSE4-NEXT:    popq %r13
@@ -1889,356 +1905,336 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; SSE2-NEXT:    pushq %r13
 ; SSE2-NEXT:    pushq %r12
 ; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    subq $96, %rsp
-; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    andl $127, %ebx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    andl $127, %r14d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    andl $127, %r15d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT:    andl $127, %r12d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT:    andl $127, %r13d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    andl $127, %ebp
+; SSE2-NEXT:    movdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pextrq $1, %xmm1, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pextrq $1, %xmm2, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Folded Spill
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pextrq $1, %xmm3, %rbx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pextrq $1, %xmm4, %r14
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pextrq $1, %xmm5, %r12
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    pextrq $1, %xmm6, %r11
+; SSE2-NEXT:    movq %xmm6, %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    andl $127, %r10d
 ; SSE2-NEXT:    andl $127, %edx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT:    andl $127, %r11d
-; SSE2-NEXT:    movq %r8, %rcx
-; SSE2-NEXT:    andl $127, %ecx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT:    cmpq %rsi, %r8
-; SSE2-NEXT:    movq %rax, %r10
-; SSE2-NEXT:    sbbq %rcx, %r10
-; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %r8, %rsi
-; SSE2-NEXT:    sbbq %rax, %rcx
+; SSE2-NEXT:    andl $127, %r8d
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    movq %r11, %r15
+; SSE2-NEXT:    sbbq %r8, %r15
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    pextrq $1, %xmm6, %rax
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    pextrq $1, %xmm8, %r15
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    sbbq %r11, %r8
+; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    sbbb $0, %bpl
+; SSE2-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %rsi, %rcx
+; SSE2-NEXT:    movq %r15, %r8
+; SSE2-NEXT:    sbbq %rdx, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %rcx, %rsi
+; SSE2-NEXT:    pextrq $1, %xmm7, %rcx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm8
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    pextrq $1, %xmm8, %rsi
+; SSE2-NEXT:    sbbq %r15, %rdx
 ; SSE2-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    cmpq %r10, %rax
-; SSE2-NEXT:    movq %r11, %rcx
-; SSE2-NEXT:    sbbq %rdx, %rcx
-; SSE2-NEXT:    setb %cl
-; SSE2-NEXT:    cmpq %rax, %r10
-; SSE2-NEXT:    sbbq %r11, %rdx
-; SSE2-NEXT:    sbbb $0, %cl
-; SSE2-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    cmpq %r9, %rax
-; SSE2-NEXT:    movq %rbp, %rcx
-; SSE2-NEXT:    sbbq %r13, %rcx
+; SSE2-NEXT:    movq %xmm8, %rdx
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %r9, %rdx
+; SSE2-NEXT:    movq %rsi, %rdi
+; SSE2-NEXT:    sbbq %r10, %rdi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm8
 ; SSE2-NEXT:    setb %dil
-; SSE2-NEXT:    cmpq %rax, %r9
-; SSE2-NEXT:    sbbq %rbp, %r13
+; SSE2-NEXT:    cmpq %rdx, %r9
+; SSE2-NEXT:    movq %xmm7, %rdx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    pextrq $1, %xmm7, %r9
+; SSE2-NEXT:    pand %xmm0, %xmm8
+; SSE2-NEXT:    sbbq %rsi, %r10
+; SSE2-NEXT:    movq %xmm7, %rsi
 ; SSE2-NEXT:    sbbb $0, %dil
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq %r12, %r9
-; SSE2-NEXT:    sbbq %r15, %r9
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq %r14, %r9
-; SSE2-NEXT:    sbbq %rbx, %r9
-; SSE2-NEXT:    setb %r9b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r14, %rbx
-; SSE2-NEXT:    sbbb $0, %r9b
-; SSE2-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; SSE2-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %rdx, %rsi
+; SSE2-NEXT:    movq %r9, %r10
+; SSE2-NEXT:    sbbq %rcx, %r10
+; SSE2-NEXT:    pextrq $1, %xmm8, %r10
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    setb %r15b
+; SSE2-NEXT:    cmpq %rsi, %rdx
+; SSE2-NEXT:    pextrq $1, %xmm7, %rdx
+; SSE2-NEXT:    sbbq %r9, %rcx
+; SSE2-NEXT:    movq %xmm8, %rcx
+; SSE2-NEXT:    movq %xmm7, %rsi
+; SSE2-NEXT:    sbbb $0, %r15b
+; SSE2-NEXT:    cmpq %rcx, %rsi
+; SSE2-NEXT:    movq %rdx, %r9
+; SSE2-NEXT:    sbbq %r10, %r9
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm7
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    cmpq %rsi, %rcx
+; SSE2-NEXT:    movq %xmm6, %rcx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    pextrq $1, %xmm6, %rsi
+; SSE2-NEXT:    pand %xmm0, %xmm7
+; SSE2-NEXT:    sbbq %rdx, %r10
+; SSE2-NEXT:    movq %xmm6, %rdx
+; SSE2-NEXT:    sbbb $0, %dil
+; SSE2-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %rcx, %rdx
 ; SSE2-NEXT:    movq %rsi, %r9
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; SSE2-NEXT:    sbbq %rdx, %r9
-; SSE2-NEXT:    setb %r9b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rsi, %rdx
-; SSE2-NEXT:    sbbb $0, %r9b
-; SSE2-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; SSE2-NEXT:    sbbq %rax, %r9
+; SSE2-NEXT:    pextrq $1, %xmm7, %r9
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    setb %dil
+; SSE2-NEXT:    cmpq %rdx, %rcx
+; SSE2-NEXT:    pextrq $1, %xmm6, %rcx
+; SSE2-NEXT:    sbbq %rsi, %rax
+; SSE2-NEXT:    movq %xmm7, %rax
+; SSE2-NEXT:    movq %xmm6, %rdx
+; SSE2-NEXT:    sbbb $0, %dil
+; SSE2-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq %rcx, %rsi
+; SSE2-NEXT:    sbbq %r9, %rsi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm6
+; SSE2-NEXT:    setb %sil
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    movq %xmm5, %rax
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    pextrq $1, %xmm5, %rdx
+; SSE2-NEXT:    pand %xmm0, %xmm6
+; SSE2-NEXT:    sbbq %rcx, %r9
+; SSE2-NEXT:    movq %xmm5, %rcx
+; SSE2-NEXT:    sbbb $0, %sil
+; SSE2-NEXT:    movb %sil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
 ; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    movq %r8, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT:    sbbq %rsi, %rdx
-; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sbbq %r12, %rsi
+; SSE2-NEXT:    pextrq $1, %xmm6, %rsi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    setb %dil
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r8, %rsi
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    movq %r8, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT:    sbbq %rsi, %rdx
-; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    pextrq $1, %xmm5, %rax
+; SSE2-NEXT:    sbbq %rdx, %r12
+; SSE2-NEXT:    movq %xmm6, %rcx
+; SSE2-NEXT:    movq %xmm5, %rdx
+; SSE2-NEXT:    sbbb $0, %dil
+; SSE2-NEXT:    movb %dil, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    cmpq %rcx, %rdx
+; SSE2-NEXT:    movq %rax, %r12
+; SSE2-NEXT:    sbbq %rsi, %r12
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm5
+; SSE2-NEXT:    setb %r12b
+; SSE2-NEXT:    cmpq %rdx, %rcx
+; SSE2-NEXT:    movq %xmm4, %rcx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    pextrq $1, %xmm4, %rdx
+; SSE2-NEXT:    pand %xmm0, %xmm5
+; SSE2-NEXT:    sbbq %rax, %rsi
+; SSE2-NEXT:    movq %xmm4, %rax
+; SSE2-NEXT:    sbbb $0, %r12b
 ; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r8, %rsi
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sbbq %r14, %rsi
+; SSE2-NEXT:    pextrq $1, %xmm5, %rsi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    pand %xmm0, %xmm4
+; SSE2-NEXT:    setb %r13b
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    pextrq $1, %xmm4, %rax
+; SSE2-NEXT:    sbbq %rdx, %r14
+; SSE2-NEXT:    movq %xmm5, %rcx
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    sbbb $0, %r13b
 ; SSE2-NEXT:    cmpq %rcx, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    movq %r8, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; SSE2-NEXT:    sbbq %rsi, %rax
-; SSE2-NEXT:    setb %r9b
+; SSE2-NEXT:    movq %rax, %r14
+; SSE2-NEXT:    sbbq %rsi, %r14
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm4
+; SSE2-NEXT:    setb %r14b
 ; SSE2-NEXT:    cmpq %rdx, %rcx
-; SSE2-NEXT:    sbbq %r8, %rsi
-; SSE2-NEXT:    sbbb $0, %r9b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    cmpq %rdx, %rsi
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    movq %r8, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    sbbq %rax, %rcx
-; SSE2-NEXT:    setb %cl
-; SSE2-NEXT:    cmpq %rsi, %rdx
-; SSE2-NEXT:    sbbq %r8, %rax
-; SSE2-NEXT:    sbbb $0, %cl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rsi, %r8
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    sbbq %rax, %rdx
-; SSE2-NEXT:    setb %dl
-; SSE2-NEXT:    cmpq %r8, %rsi
-; SSE2-NEXT:    sbbq %r10, %rax
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    cmpq %r8, %r10
-; SSE2-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rsi
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    movq %xmm3, %rdi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    pextrq $1, %xmm3, %rdx
+; SSE2-NEXT:    pand %xmm0, %xmm4
 ; SSE2-NEXT:    sbbq %rax, %rsi
-; SSE2-NEXT:    setb %sil
-; SSE2-NEXT:    cmpq %r10, %r8
-; SSE2-NEXT:    sbbq %rbx, %rax
-; SSE2-NEXT:    sbbb $0, %sil
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    cmpq %r10, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    movq %r14, %r8
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; SSE2-NEXT:    movq %xmm3, %rax
+; SSE2-NEXT:    sbbb $0, %r14b
+; SSE2-NEXT:    cmpq %rdi, %rax
+; SSE2-NEXT:    movq %rdx, %rsi
+; SSE2-NEXT:    sbbq %rbx, %rsi
+; SSE2-NEXT:    pextrq $1, %xmm4, %rcx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    cmpq %rax, %rdi
+; SSE2-NEXT:    pextrq $1, %xmm3, %rsi
+; SSE2-NEXT:    sbbq %rdx, %rbx
+; SSE2-NEXT:    movq %xmm4, %rdx
+; SSE2-NEXT:    movq %xmm3, %rdi
+; SSE2-NEXT:    sbbb $0, %bpl
+; SSE2-NEXT:    cmpq %rdx, %rdi
+; SSE2-NEXT:    movq %rsi, %rbx
+; SSE2-NEXT:    sbbq %rcx, %rbx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm3
+; SSE2-NEXT:    setb %bl
+; SSE2-NEXT:    cmpq %rdi, %rdx
+; SSE2-NEXT:    movq %xmm2, %rdi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    pextrq $1, %xmm2, %rax
+; SSE2-NEXT:    pand %xmm0, %xmm3
+; SSE2-NEXT:    sbbq %rsi, %rcx
+; SSE2-NEXT:    movq %xmm2, %rcx
+; SSE2-NEXT:    sbbb $0, %bl
+; SSE2-NEXT:    cmpq %rdi, %rcx
+; SSE2-NEXT:    movq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r8, %rdx
+; SSE2-NEXT:    pextrq $1, %xmm3, %rsi
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    cmpq %rcx, %rdi
+; SSE2-NEXT:    pextrq $1, %xmm2, %rdi
 ; SSE2-NEXT:    sbbq %rax, %r8
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %rbx, %r10
-; SSE2-NEXT:    sbbq %r14, %rax
-; SSE2-NEXT:    sbbb $0, %r8b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    cmpq %rbx, %r14
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %r10
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    sbbq %rax, %r10
+; SSE2-NEXT:    movq %xmm3, %rcx
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    movq %rdi, %rdx
+; SSE2-NEXT:    sbbq %rsi, %rdx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm2
 ; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %r14, %rbx
-; SSE2-NEXT:    sbbq %r15, %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    movq %xmm1, %r8
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    pextrq $1, %xmm1, %rdx
+; SSE2-NEXT:    pand %xmm0, %xmm2
+; SSE2-NEXT:    sbbq %rdi, %rsi
+; SSE2-NEXT:    movq %xmm1, %rdi
 ; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    cmpq %r14, %r15
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    sbbq %rax, %rbx
-; SSE2-NEXT:    setb %bl
-; SSE2-NEXT:    cmpq %r15, %r14
-; SSE2-NEXT:    sbbq %r12, %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    sbbb $0, %bl
-; SSE2-NEXT:    cmpq %r14, %r15
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT:    movq %r13, %r12
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    sbbq %rax, %r12
-; SSE2-NEXT:    setb %bpl
-; SSE2-NEXT:    cmpq %r15, %r14
-; SSE2-NEXT:    sbbq %r13, %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    sbbb $0, %bpl
-; SSE2-NEXT:    cmpq %r14, %r15
+; SSE2-NEXT:    cmpq %r8, %rdi
+; SSE2-NEXT:    movq %rdx, %rsi
 ; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE2-NEXT:    movq %rax, %r12
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r13, %r12
-; SSE2-NEXT:    setb %r12b
-; SSE2-NEXT:    cmpq %r15, %r14
-; SSE2-NEXT:    sbbq %rax, %r13
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r15d # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r13d # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl %dil, %r14d
-; SSE2-NEXT:    movd %r13d, %xmm0
-; SSE2-NEXT:    movzbl %r11b, %edi
-; SSE2-NEXT:    sbbb $0, %r12b
-; SSE2-NEXT:    movzbl %r12b, %r11d
-; SSE2-NEXT:    pinsrb $1, %r15d, %xmm0
-; SSE2-NEXT:    pinsrb $2, %r14d, %xmm0
+; SSE2-NEXT:    sbbq %rax, %rsi
+; SSE2-NEXT:    pextrq $1, %xmm2, %rcx
+; SSE2-NEXT:    movdqu {{[0-9]+}}(%rsp), %xmm1
+; SSE2-NEXT:    pand %xmm0, %xmm1
+; SSE2-NEXT:    setb %r9b
+; SSE2-NEXT:    cmpq %rdi, %r8
+; SSE2-NEXT:    pextrq $1, %xmm1, %rsi
+; SSE2-NEXT:    sbbq %rdx, %rax
+; SSE2-NEXT:    movq %xmm2, %rax
+; SSE2-NEXT:    movq %xmm1, %rdi
+; SSE2-NEXT:    sbbb $0, %r9b
+; SSE2-NEXT:    cmpq %rax, %rdi
+; SSE2-NEXT:    movq %rsi, %rdx
+; SSE2-NEXT:    sbbq %rcx, %rdx
+; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    cmpq %rdi, %rax
+; SSE2-NEXT:    sbbq %rsi, %rcx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r8d # 1-byte Folded Reload
+; SSE2-NEXT:    movd %ecx, %xmm0
+; SSE2-NEXT:    movzbl %r15b, %edi
+; SSE2-NEXT:    sbbb $0, %dl
+; SSE2-NEXT:    movzbl %dl, %ecx
+; SSE2-NEXT:    pinsrb $1, %eax, %xmm0
+; SSE2-NEXT:    pinsrb $2, %r8d, %xmm0
 ; SSE2-NEXT:    pinsrb $3, %edi, %xmm0
-; SSE2-NEXT:    andl $3, %r11d
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movb %r11b, 4(%r12)
-; SSE2-NEXT:    movd %xmm0, %r11d
-; SSE2-NEXT:    andl $3, %r11d
-; SSE2-NEXT:    andl $3, %r15d
-; SSE2-NEXT:    leal (%r11,%r15,4), %r11d
-; SSE2-NEXT:    andl $3, %r14d
-; SSE2-NEXT:    shll $4, %r14d
-; SSE2-NEXT:    orl %r11d, %r14d
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    movb %cl, 4(%r15)
+; SSE2-NEXT:    movd %xmm0, %ecx
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    andl $3, %eax
+; SSE2-NEXT:    leal (%rcx,%rax,4), %eax
+; SSE2-NEXT:    andl $3, %r8d
+; SSE2-NEXT:    shll $4, %r8d
+; SSE2-NEXT:    orl %eax, %r8d
 ; SSE2-NEXT:    andl $3, %edi
 ; SSE2-NEXT:    shll $6, %edi
-; SSE2-NEXT:    orl %r14d, %edi
-; SSE2-NEXT:    movzbl %bpl, %r11d
-; SSE2-NEXT:    andl $3, %r11d
-; SSE2-NEXT:    shll $8, %r11d
-; SSE2-NEXT:    orl %edi, %r11d
-; SSE2-NEXT:    movzbl %bl, %edi
-; SSE2-NEXT:    andl $3, %edi
-; SSE2-NEXT:    shll $10, %edi
-; SSE2-NEXT:    orl %r11d, %edi
-; SSE2-NEXT:    movzbl %r8b, %r8d
-; SSE2-NEXT:    movzbl %r10b, %r10d
-; SSE2-NEXT:    andl $3, %r10d
-; SSE2-NEXT:    shll $12, %r10d
-; SSE2-NEXT:    andl $3, %r8d
-; SSE2-NEXT:    shll $14, %r8d
-; SSE2-NEXT:    orl %r10d, %r8d
-; SSE2-NEXT:    movzbl %sil, %esi
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $16, %esi
-; SSE2-NEXT:    orl %r8d, %esi
-; SSE2-NEXT:    movzbl %dl, %edx
+; SSE2-NEXT:    orl %r8d, %edi
+; SSE2-NEXT:    movzbl %r9b, %ecx
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    shll $8, %ecx
+; SSE2-NEXT:    orl %edi, %ecx
+; SSE2-NEXT:    movzbl %r10b, %eax
+; SSE2-NEXT:    andl $3, %eax
+; SSE2-NEXT:    shll $10, %eax
+; SSE2-NEXT:    orl %ecx, %eax
+; SSE2-NEXT:    movzbl %bl, %ecx
+; SSE2-NEXT:    movzbl %r11b, %edx
 ; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $18, %edx
-; SSE2-NEXT:    orl %esi, %edx
-; SSE2-NEXT:    movzbl %cl, %ecx
+; SSE2-NEXT:    shll $12, %edx
 ; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shll $20, %ecx
+; SSE2-NEXT:    shll $14, %ecx
 ; SSE2-NEXT:    orl %edx, %ecx
-; SSE2-NEXT:    movzbl %r9b, %edx
+; SSE2-NEXT:    movzbl %bpl, %edx
 ; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $22, %edx
+; SSE2-NEXT:    shll $16, %edx
 ; SSE2-NEXT:    orl %ecx, %edx
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %eax
-; SSE2-NEXT:    shll $24, %eax
-; SSE2-NEXT:    orl %edx, %eax
-; SSE2-NEXT:    orl %edi, %eax
+; SSE2-NEXT:    movzbl %r14b, %ecx
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    shll $18, %ecx
+; SSE2-NEXT:    orl %edx, %ecx
+; SSE2-NEXT:    movzbl %r13b, %edx
+; SSE2-NEXT:    andl $3, %edx
+; SSE2-NEXT:    shll $20, %edx
+; SSE2-NEXT:    orl %ecx, %edx
+; SSE2-NEXT:    movzbl %r12b, %esi
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shll $22, %esi
+; SSE2-NEXT:    orl %edx, %esi
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %ecx
+; SSE2-NEXT:    shll $24, %ecx
+; SSE2-NEXT:    orl %esi, %ecx
+; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
 ; SSE2-NEXT:    andl $3, %edx
 ; SSE2-NEXT:    shlq $26, %rdx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shlq $28, %rcx
-; SSE2-NEXT:    orq %rdx, %rcx
+; SSE2-NEXT:    andl $3, %eax
+; SSE2-NEXT:    shlq $28, %rax
+; SSE2-NEXT:    orq %rdx, %rax
 ; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
 ; SSE2-NEXT:    andl $3, %edx
 ; SSE2-NEXT:    shlq $30, %rdx
-; SSE2-NEXT:    orq %rcx, %rdx
 ; SSE2-NEXT:    orq %rax, %rdx
-; SSE2-NEXT:    movq %r12, %rax
-; SSE2-NEXT:    movl %edx, (%r12)
-; SSE2-NEXT:    addq $96, %rsp
+; SSE2-NEXT:    orq %rcx, %rdx
+; SSE2-NEXT:    movq %r15, %rax
+; SSE2-NEXT:    movl %edx, (%r15)
 ; SSE2-NEXT:    popq %rbx
 ; SSE2-NEXT:    popq %r12
 ; SSE2-NEXT:    popq %r13
@@ -2255,319 +2251,272 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    pushq %r13
 ; AVX2-NEXT:    pushq %r12
 ; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $88, %rsp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    vmovdqa {{.*#+}} xmm0 = [18446744073709551615,127]
 ; AVX2-NEXT:    andl $127, %r8d
 ; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX2-NEXT:    andl $127, %edx
 ; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    andl $127, %r15d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX2-NEXT:    andl $127, %r14d
 ; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r10
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX2-NEXT:    vpextrq $1, %xmm4, %r11
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT:    vpextrq $1, %xmm6, %rbx
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm7
+; AVX2-NEXT:    vpextrq $1, %xmm7, %r14
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
 ; AVX2-NEXT:    andl $127, %edx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT:    andl $127, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    andl $127, %r8d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    andl $127, %r12d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    andl $127, %r13d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rbx, %r11
-; AVX2-NEXT:    movq %r13, %r10
-; AVX2-NEXT:    sbbq %r12, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rbx
-; AVX2-NEXT:    sbbq %r13, %r12
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %r10, %r11
-; AVX2-NEXT:    movq %r8, %rbx
-; AVX2-NEXT:    sbbq %rbp, %rbx
-; AVX2-NEXT:    setb %bl
-; AVX2-NEXT:    cmpq %r11, %r10
-; AVX2-NEXT:    sbbq %r8, %rbp
-; AVX2-NEXT:    sbbb $0, %bl
-; AVX2-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    cmpq %r8, %r10
-; AVX2-NEXT:    movq %rdx, %r11
-; AVX2-NEXT:    sbbq %r14, %r11
-; AVX2-NEXT:    setb %r11b
-; AVX2-NEXT:    cmpq %r10, %r8
-; AVX2-NEXT:    sbbq %rdx, %r14
-; AVX2-NEXT:    sbbb $0, %r11b
-; AVX2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    cmpq %rdx, %r8
-; AVX2-NEXT:    movq %rax, %r10
-; AVX2-NEXT:    sbbq %r15, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r8, %rdx
+; AVX2-NEXT:    vpextrq $1, %xmm8, %r15
+; AVX2-NEXT:    vmovq %xmm8, %r12
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT:    vpextrq $1, %xmm8, %rax
+; AVX2-NEXT:    vmovq %xmm8, %rbp
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT:    cmpq %r12, %rbp
+; AVX2-NEXT:    movq %rax, %r13
+; AVX2-NEXT:    sbbq %r15, %r13
+; AVX2-NEXT:    vpextrq $1, %xmm8, %r13
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rbp, %r12
+; AVX2-NEXT:    vmovq %xmm8, %r12
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm8
+; AVX2-NEXT:    vpextrq $1, %xmm8, %rbp
 ; AVX2-NEXT:    sbbq %rax, %r15
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    vmovq %xmm8, %rax
+; AVX2-NEXT:    cmpq %r12, %rax
+; AVX2-NEXT:    movq %rbp, %r15
+; AVX2-NEXT:    sbbq %r13, %r15
 ; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    cmpq %rax, %r12
+; AVX2-NEXT:    vmovq %xmm7, %rax
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm7
+; AVX2-NEXT:    vpextrq $1, %xmm7, %r15
+; AVX2-NEXT:    sbbq %rbp, %r13
 ; AVX2-NEXT:    sbbb $0, %r8b
 ; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    vmovq %xmm7, %r12
+; AVX2-NEXT:    cmpq %rax, %r12
+; AVX2-NEXT:    movq %r15, %r13
+; AVX2-NEXT:    sbbq %r14, %r13
+; AVX2-NEXT:    vmovq %xmm6, %r13
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
 ; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    cmpq %r12, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm6, %rax
+; AVX2-NEXT:    vmovq %xmm6, %r12
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm6, %r14
 ; AVX2-NEXT:    sbbb $0, %r8b
 ; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    cmpq %r13, %r12
+; AVX2-NEXT:    movq %rax, %r15
+; AVX2-NEXT:    sbbq %rbx, %r15
 ; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    cmpq %r12, %r13
+; AVX2-NEXT:    vmovq %xmm6, %r15
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX2-NEXT:    vpextrq $1, %xmm6, %r12
+; AVX2-NEXT:    sbbq %rax, %rbx
+; AVX2-NEXT:    vmovq %xmm6, %rax
 ; AVX2-NEXT:    sbbb $0, %r8b
 ; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r12b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r12b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    cmpq %r15, %rax
+; AVX2-NEXT:    movq %r12, %rbx
+; AVX2-NEXT:    sbbq %r14, %rbx
+; AVX2-NEXT:    vpextrq $1, %xmm5, %rbx
 ; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    cmpq %rax, %r15
+; AVX2-NEXT:    vmovq %xmm5, %rax
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX2-NEXT:    vpextrq $1, %xmm5, %r15
+; AVX2-NEXT:    sbbq %r12, %r14
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    vmovq %xmm5, %r14
+; AVX2-NEXT:    cmpq %rax, %r14
+; AVX2-NEXT:    movq %r15, %r12
+; AVX2-NEXT:    sbbq %rbx, %r12
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %r14, %rax
+; AVX2-NEXT:    vmovq %xmm4, %rax
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX2-NEXT:    vpextrq $1, %xmm4, %r14
+; AVX2-NEXT:    sbbq %r15, %rbx
 ; AVX2-NEXT:    sbbb $0, %r8b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    cmpq %rax, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    movq %rbx, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r11, %rdx
-; AVX2-NEXT:    setb %dl
-; AVX2-NEXT:    cmpq %r10, %rax
-; AVX2-NEXT:    sbbq %rbx, %r11
-; AVX2-NEXT:    sbbb $0, %dl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rax, %r11
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    vmovq %xmm4, %rbx
+; AVX2-NEXT:    cmpq %rax, %rbx
+; AVX2-NEXT:    movq %r14, %r15
+; AVX2-NEXT:    sbbq %r11, %r15
+; AVX2-NEXT:    vmovq %xmm3, %r15
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rbx, %rax
+; AVX2-NEXT:    vpextrq $1, %xmm3, %rax
+; AVX2-NEXT:    vmovq %xmm3, %rbx
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT:    sbbq %r14, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r11
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    cmpq %r15, %rbx
+; AVX2-NEXT:    movq %rax, %r14
+; AVX2-NEXT:    sbbq %r10, %r14
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rbx, %r15
+; AVX2-NEXT:    vmovq %xmm3, %rbx
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX2-NEXT:    vpextrq $1, %xmm3, %r14
+; AVX2-NEXT:    sbbq %rax, %r10
+; AVX2-NEXT:    vmovq %xmm3, %rax
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    cmpq %rbx, %rax
 ; AVX2-NEXT:    movq %r14, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    sbbq %rbx, %r10
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r15
 ; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rax
-; AVX2-NEXT:    sbbq %r14, %rbx
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
 ; AVX2-NEXT:    cmpq %rax, %rbx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %r11
-; AVX2-NEXT:    movq (%rsp), %r14 # 8-byte Reload
+; AVX2-NEXT:    vmovq %xmm2, %rax
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rbx
 ; AVX2-NEXT:    sbbq %r14, %r11
+; AVX2-NEXT:    sbbb $0, %r10b
+; AVX2-NEXT:    vmovq %xmm2, %r14
+; AVX2-NEXT:    cmpq %rax, %r14
+; AVX2-NEXT:    movq %rbx, %r11
+; AVX2-NEXT:    sbbq %r15, %r11
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rbp
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
 ; AVX2-NEXT:    setb %r11b
-; AVX2-NEXT:    cmpq %rbx, %rax
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r12
+; AVX2-NEXT:    cmpq %r14, %rax
+; AVX2-NEXT:    sbbq %rbx, %r15
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vmovq %xmm2, %r14
 ; AVX2-NEXT:    sbbb $0, %r11b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
 ; AVX2-NEXT:    cmpq %rax, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %rbx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %rbx
+; AVX2-NEXT:    movq %r12, %rbx
+; AVX2-NEXT:    sbbq %rbp, %rbx
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
 ; AVX2-NEXT:    setb %bl
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r15
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r13
 ; AVX2-NEXT:    cmpq %r14, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    sbbq %r12, %rbp
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vmovq %xmm2, %r14
 ; AVX2-NEXT:    sbbb $0, %bl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %r9, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    cmpq %rax, %r14
+; AVX2-NEXT:    movq %r13, %r12
+; AVX2-NEXT:    sbbq %r15, %r12
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
 ; AVX2-NEXT:    setb %bpl
-; AVX2-NEXT:    cmpq %rax, %r9
+; AVX2-NEXT:    cmpq %r14, %rax
 ; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    vmovq %xmm2, %r15
 ; AVX2-NEXT:    sbbb $0, %bpl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    cmpq %rax, %r15
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm2, %r13
+; AVX2-NEXT:    movq %r13, %r14
+; AVX2-NEXT:    sbbq %r12, %r14
+; AVX2-NEXT:    setb %r14b
+; AVX2-NEXT:    cmpq %r15, %rax
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    sbbq %r13, %r12
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX2-NEXT:    sbbb $0, %r14b
+; AVX2-NEXT:    vmovq %xmm1, %r12
+; AVX2-NEXT:    cmpq %r9, %r12
+; AVX2-NEXT:    movq %rax, %r15
+; AVX2-NEXT:    sbbq %rdx, %r15
+; AVX2-NEXT:    setb %r15b
+; AVX2-NEXT:    cmpq %r12, %r9
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vpextrq $1, %xmm1, %r12
+; AVX2-NEXT:    sbbq %rax, %rdx
+; AVX2-NEXT:    vmovq %xmm1, %rax
+; AVX2-NEXT:    sbbb $0, %r15b
 ; AVX2-NEXT:    cmpq %rsi, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %r9
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %r9
+; AVX2-NEXT:    movq %r12, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; AVX2-NEXT:    sbbq %rdx, %r8
 ; AVX2-NEXT:    setb %r9b
 ; AVX2-NEXT:    cmpq %rax, %rsi
-; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX2-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX2-NEXT:    sbbq %r12, %rdx
+; AVX2-NEXT:    vmovq %xmm2, %rdx
 ; AVX2-NEXT:    sbbb $0, %r9b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %rcx, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %rsi
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %rsi
-; AVX2-NEXT:    setb %sil
-; AVX2-NEXT:    cmpq %rax, %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    sbbb $0, %sil
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    cmpq %rax, %rcx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    setb %r14b
-; AVX2-NEXT:    cmpq %rcx, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
+; AVX2-NEXT:    cmpq %rcx, %rdx
+; AVX2-NEXT:    movq %rax, %rsi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r12, %rsi
+; AVX2-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX2-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rdx, %rcx
+; AVX2-NEXT:    vpextrq $1, %xmm0, %rcx
+; AVX2-NEXT:    vmovq %xmm1, %rdx
+; AVX2-NEXT:    sbbq %rax, %r12
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    vmovq %xmm0, %rax
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    movq %rcx, %r12
+; AVX2-NEXT:    sbbq %rsi, %r12
+; AVX2-NEXT:    setb %r12b
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    sbbq %rcx, %rsi
 ; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    sbbb $0, %r14b
-; AVX2-NEXT:    movzbl %r14b, %ecx
+; AVX2-NEXT:    sbbb $0, %r12b
+; AVX2-NEXT:    movzbl %r12b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    movb %cl, 4(%rdi)
-; AVX2-NEXT:    movzbl %sil, %ecx
+; AVX2-NEXT:    movzbl %r8b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    movzbl %r9b, %esi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    leaq (%rsi,%rcx,4), %rcx
-; AVX2-NEXT:    movzbl %bpl, %esi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $4, %esi
-; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    movzbl %bl, %ecx
+; AVX2-NEXT:    movzbl %r9b, %edx
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    leaq (%rdx,%rcx,4), %rcx
+; AVX2-NEXT:    movzbl %r15b, %edx
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    shll $4, %edx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    movzbl %r14b, %ecx
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
+; AVX2-NEXT:    movzbl %bpl, %edx
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    shll $8, %edx
+; AVX2-NEXT:    orq %rcx, %rdx
+; AVX2-NEXT:    movzbl %bl, %ecx
+; AVX2-NEXT:    andl $3, %ecx
+; AVX2-NEXT:    shll $10, %ecx
 ; AVX2-NEXT:    movzbl %r11b, %esi
 ; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $8, %esi
+; AVX2-NEXT:    shll $12, %esi
 ; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    movzbl %r10b, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    movzbl %dl, %edx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    movzbl %r8b, %edi
+; AVX2-NEXT:    movzbl %r10b, %edi
 ; AVX2-NEXT:    andl $3, %edi
 ; AVX2-NEXT:    shll $14, %edi
-; AVX2-NEXT:    orq %rdx, %rdi
-; AVX2-NEXT:    movzbl %r12b, %ecx
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; AVX2-NEXT:    andl $3, %ecx
 ; AVX2-NEXT:    shll $16, %ecx
 ; AVX2-NEXT:    orq %rdi, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
+; AVX2-NEXT:    orq %rdx, %rcx
 ; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
 ; AVX2-NEXT:    andl $3, %edx
 ; AVX2-NEXT:    shll $18, %edx
@@ -2597,7 +2546,6 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX2-NEXT:    orq %rcx, %rsi
 ; AVX2-NEXT:    orq %rdx, %rsi
 ; AVX2-NEXT:    movl %esi, (%rax)
-; AVX2-NEXT:    addq $88, %rsp
 ; AVX2-NEXT:    popq %rbx
 ; AVX2-NEXT:    popq %r12
 ; AVX2-NEXT:    popq %r13
@@ -2614,318 +2562,277 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    pushq %r13
 ; AVX512-NEXT:    pushq %r12
 ; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    subq $88, %rsp
 ; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %r8, %r15
 ; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rdx, %r14
 ; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
 ; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    andl $127, %r8d
-; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    andl $127, %edx
-; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT:    andl $127, %ebp
+; AVX512-NEXT:    vmovdqa {{.*#+}} xmm0 = [18446744073709551615,127]
+; AVX512-NEXT:    andl $127, %r15d
+; AVX512-NEXT:    andl $127, %r14d
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rcx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
 ; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT:    vpextrq $1, %xmm6, %rsi
+; AVX512-NEXT:    vmovq %xmm6, %rdi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
 ; AVX512-NEXT:    andl $127, %r12d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT:    andl $127, %r13d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT:    andl $127, %r15d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    andl $127, %r10d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT:    andl $127, %ebx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    andl $127, %r8d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT:    andl $127, %r9d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    andl $127, %esi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    andl $127, %edi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    andl $127, %edx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    movq %rdx, %rcx
-; AVX512-NEXT:    sbbq %rax, %rcx
-; AVX512-NEXT:    setb %cl
-; AVX512-NEXT:    cmpq %r11, %r14
-; AVX512-NEXT:    sbbq %rdx, %rax
-; AVX512-NEXT:    sbbb $0, %cl
-; AVX512-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r9, %rdx
-; AVX512-NEXT:    sbbq %r8, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r9, %r8
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %rbx, %rdx
-; AVX512-NEXT:    sbbq %r10, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rbx, %r10
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r15, %rdx
-; AVX512-NEXT:    sbbq %r13, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r15, %r13
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r12, %rdx
-; AVX512-NEXT:    sbbq %rbp, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r12, %rbp
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm6, %r8
+; AVX512-NEXT:    vmovq %xmm6, %r11
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm6
+; AVX512-NEXT:    cmpq %rdi, %r11
+; AVX512-NEXT:    movq %r8, %rbx
+; AVX512-NEXT:    sbbq %rsi, %rbx
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %r11, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm6, %rdi
+; AVX512-NEXT:    vmovq %xmm5, %r11
+; AVX512-NEXT:    sbbq %r8, %rsi
+; AVX512-NEXT:    vmovq %xmm6, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %r11, %rsi
+; AVX512-NEXT:    movq %rdi, %r8
+; AVX512-NEXT:    sbbq %rdx, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm5, %r8
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rsi, %r11
+; AVX512-NEXT:    vmovq %xmm5, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm5
+; AVX512-NEXT:    vpextrq $1, %xmm5, %rbx
+; AVX512-NEXT:    sbbq %rdi, %rdx
+; AVX512-NEXT:    vmovq %xmm5, %rdx
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rsi, %rdx
+; AVX512-NEXT:    movq %rbx, %rdi
+; AVX512-NEXT:    sbbq %r8, %rdi
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    vmovq %xmm4, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rsi
+; AVX512-NEXT:    sbbq %rbx, %r8
+; AVX512-NEXT:    vmovq %xmm4, %rdi
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rdx, %rdi
+; AVX512-NEXT:    movq %rsi, %r8
+; AVX512-NEXT:    sbbq %rcx, %r8
+; AVX512-NEXT:    vpextrq $1, %xmm3, %r8
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm4
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rdi, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm4, %rdx
+; AVX512-NEXT:    vmovq %xmm3, %rdi
+; AVX512-NEXT:    sbbq %rsi, %rcx
+; AVX512-NEXT:    vmovq %xmm4, %rcx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm3
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rdi, %rcx
+; AVX512-NEXT:    movq %rdx, %rsi
+; AVX512-NEXT:    sbbq %r8, %rsi
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rcx, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm3, %rcx
+; AVX512-NEXT:    vmovq %xmm2, %rsi
+; AVX512-NEXT:    sbbq %rdx, %r8
+; AVX512-NEXT:    vmovq %xmm3, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movb %r9b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rsi, %rdx
+; AVX512-NEXT:    movq %rcx, %rdi
+; AVX512-NEXT:    sbbq %rax, %rdi
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rdi
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    vmovq %xmm2, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX512-NEXT:    sbbq %rcx, %rax
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    sbbb $0, %r8b
+; AVX512-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rdx, %rax
+; AVX512-NEXT:    movq %rsi, %rcx
+; AVX512-NEXT:    sbbq %rdi, %rcx
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    cmpq %rax, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT:    sbbq %rsi, %rdi
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    vmovq %xmm2, %rsi
+; AVX512-NEXT:    sbbb $0, %r8b
+; AVX512-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    movq %rcx, %rdi
+; AVX512-NEXT:    sbbq %rax, %rdi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
 ; AVX512-NEXT:    setb %r13b
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    cmpq %rsi, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rsi
+; AVX512-NEXT:    sbbq %rcx, %rax
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    vmovq %xmm2, %rcx
 ; AVX512-NEXT:    sbbb $0, %r13b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    movq %rsi, %rdi
+; AVX512-NEXT:    sbbq %rdx, %rdi
 ; AVX512-NEXT:    setb %bpl
 ; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rax
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT:    vmovq %xmm1, %rdx
 ; AVX512-NEXT:    sbbb $0, %bpl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    cmpq %rcx, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rax
-; AVX512-NEXT:    setb %r9b
-; AVX512-NEXT:    cmpq %rdx, %rcx
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r9b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT:    vmovq %xmm2, %rsi
 ; AVX512-NEXT:    cmpq %rdx, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rcx
-; AVX512-NEXT:    setb %cl
+; AVX512-NEXT:    movq %rcx, %rdi
+; AVX512-NEXT:    sbbq %rax, %rdi
+; AVX512-NEXT:    setb %bl
 ; AVX512-NEXT:    cmpq %rsi, %rdx
-; AVX512-NEXT:    sbbq %rdi, %rax
-; AVX512-NEXT:    sbbb $0, %cl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    cmpq %rsi, %rdi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    movq %r8, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    sbbq %rcx, %rax
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    vmovq %xmm2, %rcx
+; AVX512-NEXT:    sbbb $0, %bl
+; AVX512-NEXT:    cmpq %rsi, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, %rdi
+; AVX512-NEXT:    sbbq %rdx, %rdi
+; AVX512-NEXT:    setb %r11b
+; AVX512-NEXT:    cmpq %rcx, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rcx
 ; AVX512-NEXT:    sbbq %rax, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rdi, %rsi
-; AVX512-NEXT:    sbbq %r8, %rax
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    cmpq %rdi, %r8
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT:    movq %r10, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rsi
-; AVX512-NEXT:    setb %sil
-; AVX512-NEXT:    cmpq %r8, %rdi
-; AVX512-NEXT:    sbbq %r10, %rax
-; AVX512-NEXT:    sbbb $0, %sil
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    cmpq %r8, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT:    movq %r11, %rdi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rdi
-; AVX512-NEXT:    setb %dil
-; AVX512-NEXT:    cmpq %r10, %r8
-; AVX512-NEXT:    sbbq %r11, %rax
-; AVX512-NEXT:    sbbb $0, %dil
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    cmpq %rax, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    movq %rbx, %r8
-; AVX512-NEXT:    movq (%rsp), %r11 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r11, %r8
-; AVX512-NEXT:    setb %r8b
-; AVX512-NEXT:    cmpq %r10, %rax
-; AVX512-NEXT:    sbbq %rbx, %r11
-; AVX512-NEXT:    sbbb $0, %r8b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    cmpq %rbx, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movq %r14, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %r10
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vmovq %xmm1, %rdx
+; AVX512-NEXT:    sbbb $0, %r11b
+; AVX512-NEXT:    vmovq %xmm2, %rsi
+; AVX512-NEXT:    cmpq %rdx, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rax
+; AVX512-NEXT:    movq %rax, %rdi
+; AVX512-NEXT:    sbbq %rcx, %rdi
 ; AVX512-NEXT:    setb %r10b
-; AVX512-NEXT:    cmpq %r11, %rbx
-; AVX512-NEXT:    sbbq %r14, %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    cmpq %rsi, %rdx
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    sbbq %rax, %rcx
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
 ; AVX512-NEXT:    sbbb $0, %r10b
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT:    cmpq %r15, %r11
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    vmovq %xmm2, %rax
+; AVX512-NEXT:    cmpq %rsi, %rax
+; AVX512-NEXT:    movq %rcx, %rdi
+; AVX512-NEXT:    sbbq %rdx, %rdi
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rax, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    sbbq %rcx, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rsi
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; AVX512-NEXT:    cmpq %rcx, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdx
+; AVX512-NEXT:    movq %rdx, %rax
+; AVX512-NEXT:    sbbq %r12, %rax
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rdi
+; AVX512-NEXT:    cmpq %rsi, %rcx
+; AVX512-NEXT:    sbbq %rdx, %r12
+; AVX512-NEXT:    vmovq %xmm1, %rcx
+; AVX512-NEXT:    sbbb $0, %r8b
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r14, %rbx
-; AVX512-NEXT:    setb %bl
-; AVX512-NEXT:    cmpq %r11, %r15
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbq %rax, %r14
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    sbbb $0, %bl
-; AVX512-NEXT:    cmpq %r11, %r14
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %rdi, %rdx
+; AVX512-NEXT:    sbbq %r14, %rdx
+; AVX512-NEXT:    setb %r12b
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm2
+; AVX512-NEXT:    vpextrq $1, %xmm2, %rcx
+; AVX512-NEXT:    sbbq %rdi, %r14
+; AVX512-NEXT:    vmovq %xmm2, %rdx
+; AVX512-NEXT:    sbbb $0, %r12b
 ; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %r15
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r12, %r15
+; AVX512-NEXT:    cmpq %rax, %rdx
+; AVX512-NEXT:    movq %rcx, %rsi
+; AVX512-NEXT:    sbbq %r15, %rsi
+; AVX512-NEXT:    vpextrq $1, %xmm1, %rsi
+; AVX512-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm0
+; AVX512-NEXT:    setb %r14b
+; AVX512-NEXT:    cmpq %rdx, %rax
+; AVX512-NEXT:    vpextrq $1, %xmm0, %rdx
+; AVX512-NEXT:    vmovq %xmm1, %rax
+; AVX512-NEXT:    sbbq %rcx, %r15
+; AVX512-NEXT:    vmovq %xmm0, %rcx
+; AVX512-NEXT:    sbbb $0, %r14b
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %rdx, %r15
+; AVX512-NEXT:    sbbq %rsi, %r15
 ; AVX512-NEXT:    setb %r15b
-; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    sbbq %rax, %r12
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rdx, %rsi
 ; AVX512-NEXT:    sbbb $0, %r15b
-; AVX512-NEXT:    movzbl %r15b, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movb %r11b, 4(%r14)
-; AVX512-NEXT:    movzbl %bl, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movzbl %r10b, %r10d
-; AVX512-NEXT:    andl $3, %r10d
-; AVX512-NEXT:    leaq (%r10,%r11,4), %r10
-; AVX512-NEXT:    movzbl %r8b, %r8d
-; AVX512-NEXT:    andl $3, %r8d
-; AVX512-NEXT:    shll $4, %r8d
-; AVX512-NEXT:    orq %r10, %r8
-; AVX512-NEXT:    movzbl %dil, %edi
-; AVX512-NEXT:    andl $3, %edi
-; AVX512-NEXT:    shll $6, %edi
-; AVX512-NEXT:    orq %r8, %rdi
-; AVX512-NEXT:    movzbl %sil, %esi
-; AVX512-NEXT:    andl $3, %esi
-; AVX512-NEXT:    shll $8, %esi
-; AVX512-NEXT:    orq %rdi, %rsi
-; AVX512-NEXT:    movzbl %dl, %edx
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $10, %edx
-; AVX512-NEXT:    movzbl %cl, %ecx
+; AVX512-NEXT:    movzbl %r15b, %eax
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    movb %al, 4(%r15)
+; AVX512-NEXT:    movzbl %r14b, %eax
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    movzbl %r12b, %ecx
 ; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shll $12, %ecx
-; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    movzbl %r9b, %edx
+; AVX512-NEXT:    leaq (%rcx,%rax,4), %rax
+; AVX512-NEXT:    movzbl %r8b, %ecx
+; AVX512-NEXT:    andl $3, %ecx
+; AVX512-NEXT:    shll $4, %ecx
+; AVX512-NEXT:    orq %rax, %rcx
+; AVX512-NEXT:    movzbl %r9b, %eax
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    shll $6, %eax
+; AVX512-NEXT:    orq %rcx, %rax
+; AVX512-NEXT:    movzbl %r10b, %ecx
+; AVX512-NEXT:    andl $3, %ecx
+; AVX512-NEXT:    shll $8, %ecx
+; AVX512-NEXT:    orq %rax, %rcx
+; AVX512-NEXT:    movzbl %r11b, %eax
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    shll $10, %eax
+; AVX512-NEXT:    movzbl %bl, %edx
 ; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $14, %edx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl %bpl, %eax
+; AVX512-NEXT:    shll $12, %edx
+; AVX512-NEXT:    orq %rax, %rdx
+; AVX512-NEXT:    movzbl %bpl, %esi
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shll $14, %esi
+; AVX512-NEXT:    orq %rdx, %rsi
+; AVX512-NEXT:    movzbl %r13b, %eax
 ; AVX512-NEXT:    andl $3, %eax
 ; AVX512-NEXT:    shll $16, %eax
-; AVX512-NEXT:    orq %rdx, %rax
 ; AVX512-NEXT:    orq %rsi, %rax
-; AVX512-NEXT:    movzbl %r13b, %ecx
+; AVX512-NEXT:    orq %rcx, %rax
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; AVX512-NEXT:    andl $3, %ecx
 ; AVX512-NEXT:    shll $18, %ecx
 ; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
@@ -2953,9 +2860,8 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; AVX512-NEXT:    shlq $30, %rdx
 ; AVX512-NEXT:    orq %rax, %rdx
 ; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movq %r14, %rax
-; AVX512-NEXT:    movl %edx, (%r14)
-; AVX512-NEXT:    addq $88, %rsp
+; AVX512-NEXT:    movq %r15, %rax
+; AVX512-NEXT:    movl %edx, (%r15)
 ; AVX512-NEXT:    popq %rbx
 ; AVX512-NEXT:    popq %r12
 ; AVX512-NEXT:    popq %r13

>From c58c6a0aadf249e3795653c4e8060bd0bfe0d156 Mon Sep 17 00:00:00 2001
From: stomfaig <stomfaig at gmail.com>
Date: Fri, 13 Mar 2026 10:50:35 +0000
Subject: [PATCH 8/8] update test

---
 llvm/test/CodeGen/X86/pr173924.ll | 33 +++++++++++++++----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr173924.ll b/llvm/test/CodeGen/X86/pr173924.ll
index 17c048c05a7de..a25f62a0ab071 100644
--- a/llvm/test/CodeGen/X86/pr173924.ll
+++ b/llvm/test/CodeGen/X86/pr173924.ll
@@ -6,30 +6,29 @@ define i256 @PR173924(<8 x i256> %a0) {
 ; CHECK-LABEL: PR173924:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq %rdi, %rax
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r8
 ; CHECK-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; CHECK-NEXT:    andl $1, %r10d
-; CHECK-NEXT:    andl $1, %r9d
-; CHECK-NEXT:    addq %r10, %r9
 ; CHECK-NEXT:    vmovd {{.*#+}} xmm0 = [1,0,0,0]
-; CHECK-NEXT:    vpand {{[0-9]+}}(%rsp), %xmm0, %xmm1
-; CHECK-NEXT:    vmovq %xmm1, %r10
-; CHECK-NEXT:    andl $1, %edx
-; CHECK-NEXT:    addq %r10, %rdx
-; CHECK-NEXT:    addq %r9, %rdx
-; CHECK-NEXT:    andl $1, %r8d
+; CHECK-NEXT:    vpand {{[0-9]+}}(%rsp), %ymm0, %ymm0
+; CHECK-NEXT:    vmovq %xmm0, %r11
+; CHECK-NEXT:    andl $1, %r10d
 ; CHECK-NEXT:    andl $1, %esi
-; CHECK-NEXT:    addq %r8, %rsi
-; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    addq %r10, %rsi
+; CHECK-NEXT:    andl $1, %r8d
 ; CHECK-NEXT:    andl $1, %ecx
-; CHECK-NEXT:    addq %rdi, %rcx
+; CHECK-NEXT:    addq %r8, %rcx
 ; CHECK-NEXT:    addq %rsi, %rcx
-; CHECK-NEXT:    addq %rdx, %rcx
-; CHECK-NEXT:    vmovq %rcx, %xmm1
-; CHECK-NEXT:    vpblendd {{.*#+}} ymm0 = ymm1[0,1],ymm0[2,3,4,5,6,7]
+; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    addq %r11, %rdx
+; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    andl $1, %r9d
+; CHECK-NEXT:    addq %rdi, %r9
+; CHECK-NEXT:    addq %rdx, %r9
+; CHECK-NEXT:    addq %rcx, %r9
+; CHECK-NEXT:    vmovq %r9, %xmm0
 ; CHECK-NEXT:    vmovdqu %ymm0, (%rax)
 ; CHECK-NEXT:    vzeroupper
 ; CHECK-NEXT:    retq



More information about the llvm-commits mailing list