[llvm] [DAGCombine] Propagate truncate to operands (PR #98666)

Justin Fargnoli via llvm-commits llvm-commits at lists.llvm.org
Tue Sep 24 17:05:19 PDT 2024


https://github.com/justinfargnoli updated https://github.com/llvm/llvm-project/pull/98666

>From 2baa75ae52e81052e595f0531a2f3029727e6bb4 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 30 Jun 2024 18:50:30 -0700
Subject: [PATCH 01/56] Initial commit

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 ++++++++++++++++++++-
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 10 ++++
 2 files changed, 63 insertions(+), 1 deletion(-)
 create mode 100644 llvm/test/CodeGen/NVPTX/combine-truncate.ll

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 8812136733fb24..d34dad80f5dd00 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -747,7 +747,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT});
+                       ISD::TRUNCATE, ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5704,6 +5704,52 @@ static SDValue PerformREMCombine(SDNode *N,
   return SDValue();
 }
 
+// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
+// This will reduce register pressure. 
+static SDValue PerformTruncCombine(SDNode *N,
+                                   TargetLowering::DAGCombinerInfo &DCI) {
+  if (DCI.isBeforeLegalizeOps()) {
+    SDValue LogicalOp = N->getOperand(0);
+    switch (LogicalOp.getOpcode()) {
+    default: 
+      break;
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR: {
+      EVT VT = N->getValueType(0);
+      EVT LogicalVT = LogicalOp.getValueType();
+      if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+        const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+        if (VT.isScalarInteger() ||
+            TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+          if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+                return U->isMachineOpcode() ? 
+                       U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
+                       U->getOpcode() == ISD::TRUNCATE;
+              })) {
+            
+            SDLoc DL(N);
+            SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
+                                                     DL, MVT::i32);
+            SDNode *NarrowL = DCI.DAG.getMachineNode(
+                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+            SDNode *NarrowR = DCI.DAG.getMachineNode(
+                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+            return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                                   SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+          }
+        }
+      }
+      break;
+    }
+    }
+  }
+  return SDValue();
+}
+
 enum OperandSignedness {
   Signed = 0,
   Unsigned,
@@ -6126,6 +6172,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::UREM:
     case ISD::SREM:
       return PerformREMCombine(N, DCI, OptLevel);
+    case ISD::TRUNCATE:
+      return PerformTruncCombine(N, DCI);
     case ISD::SETCC:
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
@@ -6143,6 +6191,10 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::VSELECT:
       return PerformVSELECTCombine(N, DCI);
   }
+
+  if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
+    return PerformTruncCombine(N, DCI);
+
   return SDValue();
 }
 
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
new file mode 100644
index 00000000000000..f43887064acc0b
--- /dev/null
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -0,0 +1,10 @@
+; RUN: llc < %s | FileCheck %s
+
+define i32 @foo(i64 %a, i64 %b) {
+; CHECK: or.b32
+; CHECK-NOT: or.b64
+entry:
+  %or = or i64 %a, %b
+  %trunc = trunc i64 %or to i32
+  ret i32 %trunc
+}

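For reference, the rewrite this combine performs can be written out at the
IR level as follows (a hand-written sketch of the fold named in the code
comment above, not a test taken from the patch):

  ; before: the logic op is 64-bit, and only its truncated value is used
  %or = or i64 %a, %b
  %trunc = trunc i64 %or to i32

  ; after: the truncate is propagated to the operands, narrowing the op
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %or32 = or i32 %a32, %b32

Because the 64-bit inputs can be released as soon as they are truncated, the
narrowed form is what yields the register-pressure saving the comment
refers to.
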
>From 643fc52fcbad526d63775cf0b09ce09e8810318e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:07:53 -0700
Subject: [PATCH 02/56] Prefer early return

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 71 +++++++++++----------
 1 file changed, 36 insertions(+), 35 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index d34dad80f5dd00..f3f2c119add153 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5708,45 +5708,46 @@ static SDValue PerformREMCombine(SDNode *N,
 // This will reduce register pressure. 
 static SDValue PerformTruncCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
-  if (DCI.isBeforeLegalizeOps()) {
-    SDValue LogicalOp = N->getOperand(0);
-    switch (LogicalOp.getOpcode()) {
-    default: 
-      break;
-    case ISD::ADD:
-    case ISD::SUB:
-    case ISD::MUL:
-    case ISD::AND:
-    case ISD::OR:
-    case ISD::XOR: {
-      EVT VT = N->getValueType(0);
-      EVT LogicalVT = LogicalOp.getValueType();
-      if (VT == MVT::i32 && LogicalVT == MVT::i64) {
-        const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-        if (VT.isScalarInteger() ||
-            TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
-          if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-                return U->isMachineOpcode() ? 
-                       U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
-                       U->getOpcode() == ISD::TRUNCATE;
-              })) {
-            
-            SDLoc DL(N);
-            SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
-                                                     DL, MVT::i32);
-            SDNode *NarrowL = DCI.DAG.getMachineNode(
-                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
-            SDNode *NarrowR = DCI.DAG.getMachineNode(
-                NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
-            return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                   SDValue(NarrowL, 0), SDValue(NarrowR, 0));
-          }
+  if (!DCI.isBeforeLegalizeOps())
+    return SDValue();
+
+  SDValue LogicalOp = N->getOperand(0);
+  switch (LogicalOp.getOpcode()) {
+  default: 
+    break;
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR: {
+    EVT VT = N->getValueType(0);
+    EVT LogicalVT = LogicalOp.getValueType();
+    if (VT == MVT::i32 && LogicalVT == MVT::i64) {
+      const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+      if (VT.isScalarInteger() ||
+          TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
+        if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+              return U->isMachineOpcode() ? 
+                      U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
+                      U->getOpcode() == ISD::TRUNCATE;
+            })) {          
+          SDLoc DL(N);
+          SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
+                                                    DL, MVT::i32);
+          SDNode *NarrowL = DCI.DAG.getMachineNode(
+              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
+          SDNode *NarrowR = DCI.DAG.getMachineNode(
+              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
+          return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                                  SDValue(NarrowL, 0), SDValue(NarrowR, 0));
         }
       }
-      break;
-    }
     }
+    break;
   }
+  }
+
   return SDValue();
 }
 

>From b81ba1179fc12b8b7bbc291a8a9919d6ca97f552 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:08:15 -0700
Subject: [PATCH 03/56] clang-format

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index f3f2c119add153..51555c164b34a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5705,7 +5705,7 @@ static SDValue PerformREMCombine(SDNode *N,
 }
 
 // truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure. 
+// This will reduce register pressure.
 static SDValue PerformTruncCombine(SDNode *N,
                                    TargetLowering::DAGCombinerInfo &DCI) {
   if (!DCI.isBeforeLegalizeOps())
@@ -5713,7 +5713,7 @@ static SDValue PerformTruncCombine(SDNode *N,
 
   SDValue LogicalOp = N->getOperand(0);
   switch (LogicalOp.getOpcode()) {
-  default: 
+  default:
     break;
   case ISD::ADD:
   case ISD::SUB:
@@ -5728,19 +5728,19 @@ static SDValue PerformTruncCombine(SDNode *N,
       if (VT.isScalarInteger() ||
           TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
         if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-              return U->isMachineOpcode() ? 
-                      U->getMachineOpcode() == NVPTX::CVT_u32_u64 : 
-                      U->getOpcode() == ISD::TRUNCATE;
-            })) {          
+              return U->isMachineOpcode()
+                         ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                         : U->getOpcode() == ISD::TRUNCATE;
+            })) {
           SDLoc DL(N);
-          SDValue None = DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, 
-                                                    DL, MVT::i32);
+          SDValue None =
+              DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
           SDNode *NarrowL = DCI.DAG.getMachineNode(
               NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
           SDNode *NarrowR = DCI.DAG.getMachineNode(
               NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
           return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                  SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+                                 SDValue(NarrowL, 0), SDValue(NarrowR, 0));
         }
       }
     }

>From fca39135197e5115f3d90b82f7757c988685db26 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 8 Jul 2024 14:17:55 -0700
Subject: [PATCH 04/56] Add negative test for `ISD::TRUNCATE`

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 14 +++++++++++++-
 1 file changed, 13 insertions(+), 1 deletion(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index f43887064acc0b..3d0827606dbc29 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,6 +1,7 @@
 ; RUN: llc < %s | FileCheck %s
 
-define i32 @foo(i64 %a, i64 %b) {
+define i32 @trunc(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc
 ; CHECK: or.b32
 ; CHECK-NOT: or.b64
 entry:
@@ -8,3 +9,14 @@ entry:
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
+
+define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
+; CHECK-LABEL: trunc_not
+; CHECK: or.b64
+; CHECK-NOT: or.b32
+entry:
+  %or = or i64 %a, %b
+  %trunc = trunc i64 %or to i32
+  store i64 %or, ptr %p
+  ret i32 %trunc
+}

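This negative test is the counterpart of the all_of(uses()) check in the
combine: if the 64-bit result has any non-truncate use, narrowing cannot
remove the wide op. Firing anyway would merely duplicate it (hypothetical
IR, for illustration only):

  %or = or i64 %a, %b        ; still required for the i64 store
  store i64 %or, ptr %p
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %or32 = or i32 %a32, %b32  ; the same logic repeated at 32 bits

Both widths would stay live at once, so the combine only fires when every
user is an ISD::TRUNCATE (or an already-selected CVT_u32_u64).
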
>From 2438cb9ad505242b2d67d44e1b6c61ab5fc2fa30 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:42:56 -0700
Subject: [PATCH 05/56] Add cvt and cvt_not test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 90 ++++++++++++++++++---
 1 file changed, 79 insertions(+), 11 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 3d0827606dbc29..1e0d63ca248808 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -1,22 +1,90 @@
-; RUN: llc < %s | FileCheck %s
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc < %s -march=nvptx64 | FileCheck %s
+; RUN: %if ptxas %{ llc < %s -march=nvptx64 | %ptxas-verify %}
+
+target triple = "nvptx64-nvidia-cuda"
 
 define i32 @trunc(i64 %a, i64 %b) {
-; CHECK-LABEL: trunc
-; CHECK: or.b32
-; CHECK-NOT: or.b64
-entry:
+; CHECK-LABEL: trunc(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_param_1];
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    or.b32 %r3, %r2, %r1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
 
-define i32 @trunc_not(i64 %a, i64 %b, ptr %p) {
-; CHECK-LABEL: trunc_not
-; CHECK: or.b64
-; CHECK-NOT: or.b32
-entry:
+define i32 @trunc_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_not_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_not_param_1];
+; CHECK-NEXT:    or.b64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
+; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    st.u64 [%rd4], %rd3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
   %trunc = trunc i64 %or to i32
-  store i64 %or, ptr %p
+  store i64 %or, ptr null
+  ret i32 %trunc
+}
+
+define i32 @trunc_cvt(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_param_1];
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    ret;
+  %add = add i64 %a, %b
+  %or = or i64 %add, %a
+  %trunc = trunc i64 %or to i32
+  ret i32 %trunc
+}
+
+define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+; CHECK-LABEL: trunc_cvt_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<5>;
+; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
+; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
+; CHECK-NEXT:    mov.u64 %rd3, 0;
+; CHECK-NEXT:    st.u64 [%rd3], %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
+; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    ret;
+  %add = add i64 %a, %b
+  store i64 %b, ptr null
+  %or = or i64 %add, %a
+  %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }

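The trunc_cvt test exercises the cascading case: once the final or is
narrowed, the add feeding it is only used through 32-bit conversions, so the
CVT_u32_u64 machine-opcode path re-runs the combine and narrows it as well.
Conceptually (illustrative IR mirroring the PTX checks above):

  %add = add i64 %a, %b
  %or  = or i64 %add, %a
  %trunc = trunc i64 %or to i32

  ; is selected as if it had been written
  %a32 = trunc i64 %a to i32
  %b32 = trunc i64 %b to i32
  %add32 = add i32 %a32, %b32
  %or32  = or i32 %add32, %a32
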
>From 989a677331763f8308fe8aa2fd96d6148b3deae9 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:52:32 -0700
Subject: [PATCH 06/56] Prefer early return

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 44 ++++++++++-----------
 1 file changed, 22 insertions(+), 22 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 51555c164b34a7..7deb5b864be645 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5723,28 +5723,28 @@ static SDValue PerformTruncCombine(SDNode *N,
   case ISD::XOR: {
     EVT VT = N->getValueType(0);
     EVT LogicalVT = LogicalOp.getValueType();
-    if (VT == MVT::i32 && LogicalVT == MVT::i64) {
-      const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-      if (VT.isScalarInteger() ||
-          TLI.isOperationLegal(LogicalOp.getOpcode(), VT)) {
-        if (all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-              return U->isMachineOpcode()
-                         ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                         : U->getOpcode() == ISD::TRUNCATE;
-            })) {
-          SDLoc DL(N);
-          SDValue None =
-              DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-          SDNode *NarrowL = DCI.DAG.getMachineNode(
-              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), None);
-          SDNode *NarrowR = DCI.DAG.getMachineNode(
-              NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), None);
-          return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                                 SDValue(NarrowL, 0), SDValue(NarrowR, 0));
-        }
-      }
-    }
-    break;
+    if (VT != MVT::i32 || LogicalVT != MVT::i64) 
+      break;
+    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
+    if (!VT.isScalarInteger() && 
+        !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
+      break;
+    if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
+          return U->isMachineOpcode()
+                      ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                      : U->getOpcode() == ISD::TRUNCATE;
+        }))
+      break;
+
+    SDLoc DL(N);
+    SDValue CVTNone =
+        DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
+    SDNode *NarrowL = DCI.DAG.getMachineNode(
+        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
+    SDNode *NarrowR = DCI.DAG.getMachineNode(
+        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
+    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
+                            SDValue(NarrowL, 0), SDValue(NarrowR, 0));
   }
   }
 

>From 57cc226a720b7aedac0d73ea9dbe80d45add428c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:53:08 -0700
Subject: [PATCH 07/56] clang-format

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 7deb5b864be645..1b9ff609b356a7 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -5723,28 +5723,28 @@ static SDValue PerformTruncCombine(SDNode *N,
   case ISD::XOR: {
     EVT VT = N->getValueType(0);
     EVT LogicalVT = LogicalOp.getValueType();
-    if (VT != MVT::i32 || LogicalVT != MVT::i64) 
+    if (VT != MVT::i32 || LogicalVT != MVT::i64)
       break;
     const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-    if (!VT.isScalarInteger() && 
+    if (!VT.isScalarInteger() &&
         !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
       break;
     if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
           return U->isMachineOpcode()
-                      ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                      : U->getOpcode() == ISD::TRUNCATE;
+                     ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
+                     : U->getOpcode() == ISD::TRUNCATE;
         }))
       break;
 
     SDLoc DL(N);
     SDValue CVTNone =
         DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-    SDNode *NarrowL = DCI.DAG.getMachineNode(
-        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(0), CVTNone);
-    SDNode *NarrowR = DCI.DAG.getMachineNode(
-        NVPTX::CVT_u32_u64, DL, VT, LogicalOp.getOperand(1), CVTNone);
-    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT,
-                            SDValue(NarrowL, 0), SDValue(NarrowR, 0));
+    SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+                                             LogicalOp.getOperand(0), CVTNone);
+    SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
+                                             LogicalOp.getOperand(1), CVTNone);
+    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
+                           SDValue(NarrowR, 0));
   }
   }
 

>From e0d5eef8d5c1c6027f1ec73b7302f93489e68b2a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 11 Jul 2024 19:55:20 -0700
Subject: [PATCH 08/56] Correct trunc_cvt_not test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 1e0d63ca248808..30e415ebe95278 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -68,22 +68,22 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
 define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc_cvt_not(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<4>;
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-NEXT:    .reg .b64 %rd<5>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
 ; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
-; CHECK-NEXT:    mov.u64 %rd3, 0;
-; CHECK-NEXT:    st.u64 [%rd3], %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
+; CHECK-NEXT:    add.s64 %rd3, %rd1, %rd2;
+; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    st.u64 [%rd4], %rd3;
+; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
-; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b
-  store i64 %b, ptr null
+  store i64 %add, ptr null
   %or = or i64 %add, %a
   %trunc = trunc i64 %or to i32
   ret i32 %trunc

>From 19725eb49bb78079ce0ad236f3d6afe8285cc9e7 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 14:48:10 -0700
Subject: [PATCH 09/56] Delete NVPTXISelLowering implementation

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 54 +--------------------
 1 file changed, 1 insertion(+), 53 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index 1b9ff609b356a7..a913de8fa71acd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -747,7 +747,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::TRUNCATE, ISD::VSELECT});
+                       ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -5704,53 +5704,6 @@ static SDValue PerformREMCombine(SDNode *N,
   return SDValue();
 }
 
-// truncate (logic_op x, y) --> logic_op (truncate x), (truncate y)
-// This will reduce register pressure.
-static SDValue PerformTruncCombine(SDNode *N,
-                                   TargetLowering::DAGCombinerInfo &DCI) {
-  if (!DCI.isBeforeLegalizeOps())
-    return SDValue();
-
-  SDValue LogicalOp = N->getOperand(0);
-  switch (LogicalOp.getOpcode()) {
-  default:
-    break;
-  case ISD::ADD:
-  case ISD::SUB:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR: {
-    EVT VT = N->getValueType(0);
-    EVT LogicalVT = LogicalOp.getValueType();
-    if (VT != MVT::i32 || LogicalVT != MVT::i64)
-      break;
-    const TargetLowering &TLI = DCI.DAG.getTargetLoweringInfo();
-    if (!VT.isScalarInteger() &&
-        !TLI.isOperationLegal(LogicalOp.getOpcode(), VT))
-      break;
-    if (!all_of(LogicalOp.getNode()->uses(), [](SDNode *U) {
-          return U->isMachineOpcode()
-                     ? U->getMachineOpcode() == NVPTX::CVT_u32_u64
-                     : U->getOpcode() == ISD::TRUNCATE;
-        }))
-      break;
-
-    SDLoc DL(N);
-    SDValue CVTNone =
-        DCI.DAG.getTargetConstant(NVPTX::PTXCvtMode::NONE, DL, MVT::i32);
-    SDNode *NarrowL = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
-                                             LogicalOp.getOperand(0), CVTNone);
-    SDNode *NarrowR = DCI.DAG.getMachineNode(NVPTX::CVT_u32_u64, DL, VT,
-                                             LogicalOp.getOperand(1), CVTNone);
-    return DCI.DAG.getNode(LogicalOp.getOpcode(), DL, VT, SDValue(NarrowL, 0),
-                           SDValue(NarrowR, 0));
-  }
-  }
-
-  return SDValue();
-}
-
 enum OperandSignedness {
   Signed = 0,
   Unsigned,
@@ -6173,8 +6126,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::UREM:
     case ISD::SREM:
       return PerformREMCombine(N, DCI, OptLevel);
-    case ISD::TRUNCATE:
-      return PerformTruncCombine(N, DCI);
     case ISD::SETCC:
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
@@ -6193,9 +6144,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformVSELECTCombine(N, DCI);
   }
 
-  if (N->isMachineOpcode() && N->getMachineOpcode() == NVPTX::CVT_u32_u64)
-    return PerformTruncCombine(N, DCI);
-
   return SDValue();
 }
 

>From 6e1ed9bbdfc30dc90f5753ef9da30b53eb87d955 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 15 Jul 2024 15:12:58 -0700
Subject: [PATCH 10/56] Perform optimization via DAGCombine + TLI

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  2 ++
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +++--
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  4 ++++
 llvm/test/CodeGen/NVPTX/combine-truncate.ll   | 20 +++++++------------
 4 files changed, 16 insertions(+), 15 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 3842af56e6b3d7..cc44e80c4d65f7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2988,6 +2988,8 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
+  virtual bool shouldReduceRegisterPressure() const { return false; }
+
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }
 
   /// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index c6f6fc25080541..47f5677c5a182e 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5902,7 +5902,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  if (HandOpcode == ISD::TRUNCATE) {
+  if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -15484,7 +15484,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     if (!LegalOperations && N0.hasOneUse() &&
-        (isConstantOrConstantVector(N0.getOperand(0), true) ||
+        (TLI.shouldReduceRegisterPressure() ||
+         isConstantOrConstantVector(N0.getOperand(0), true) ||
          isConstantOrConstantVector(N0.getOperand(1), true))) {
       // TODO: We already restricted this to pre-legalization, but for vectors
       // we are extra cautious to not create an unsupported operation.
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 70e16eee346aa2..374c418cd21583 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,6 +506,10 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool shouldReduceRegisterPressure() const override {
+    return true;
+  }
+
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {
     if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 30e415ebe95278..a2b4c444e920f0 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -8,14 +8,11 @@ define i32 @trunc(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_param_1];
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    or.b32 %r3, %r2, %r1;
+; CHECK-NEXT:    ld.param.u32 %r1, [trunc_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [trunc_param_1];
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %or = or i64 %a, %b
@@ -48,15 +45,12 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
 ; CHECK-LABEL: trunc_cvt(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<5>;
-; CHECK-NEXT:    .reg .b64 %rd<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_param_0];
-; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_param_1];
-; CHECK-NEXT:    cvt.u32.u64 %r1, %rd2;
-; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
-; CHECK-NEXT:    add.s32 %r3, %r2, %r1;
-; CHECK-NEXT:    or.b32 %r4, %r3, %r2;
+; CHECK-NEXT:    ld.param.u32 %r1, [trunc_cvt_param_0];
+; CHECK-NEXT:    ld.param.u32 %r2, [trunc_cvt_param_1];
+; CHECK-NEXT:    add.s32 %r3, %r1, %r2;
+; CHECK-NEXT:    or.b32 %r4, %r3, %r1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r4;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b

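Moving the fold into the generic combiner lets it compose with the rest of
DAGCombine: the truncates it introduces can themselves fold away. In the
updated trunc test the operands are parameter loads, so the sequence
collapses further (a sketch of the chain of folds, not literal IR):

  ; trunc (or i64 %a, %b) --> or (trunc %a), (trunc %b)
  ; each trunc of a parameter load then becomes a narrower load,
  ; which is why the checks now show ld.param.u32 with no cvt.u32.u64
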
>From 3b501524801b7dd114696726332b081c5444582e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 14:01:32 -0700
Subject: [PATCH 11/56] Update variadics-backend.ll

---
 llvm/test/CodeGen/NVPTX/variadics-backend.ll | 65 +++++++++-----------
 1 file changed, 30 insertions(+), 35 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/variadics-backend.ll b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
index 0e0c89d3e0214f..59619fa009d1e2 100644
--- a/llvm/test/CodeGen/NVPTX/variadics-backend.ll
+++ b/llvm/test/CodeGen/NVPTX/variadics-backend.ll
@@ -11,8 +11,8 @@
 define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-LABEL: variadics1(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .reg .b32 %r<11>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<12>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-NEXT:    .reg .f64 %fd<7>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
@@ -26,23 +26,21 @@ define dso_local i32 @variadics1(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    add.s32 %r7, %r5, %r6;
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 19;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd5, %r7;
-; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, %rd4;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r8, %rd6;
-; CHECK-PTX-NEXT:    add.s64 %rd7, %rd3, 15;
-; CHECK-PTX-NEXT:    and.b64 %rd8, %rd7, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd8];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r8;
+; CHECK-PTX-NEXT:    ld.u32 %r8, [%rd3];
+; CHECK-PTX-NEXT:    add.s32 %r9, %r7, %r8;
+; CHECK-PTX-NEXT:    add.s64 %rd4, %rd3, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd5, %rd4, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd1, [%rd5];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd2, %r9;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd3, %fd2, %fd1;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r9, %fd3;
-; CHECK-PTX-NEXT:    add.s64 %rd9, %rd8, 15;
-; CHECK-PTX-NEXT:    and.b64 %rd10, %rd9, -8;
-; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd10];
-; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r9;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd3;
+; CHECK-PTX-NEXT:    add.s64 %rd6, %rd5, 15;
+; CHECK-PTX-NEXT:    and.b64 %rd7, %rd6, -8;
+; CHECK-PTX-NEXT:    ld.f64 %fd4, [%rd7];
+; CHECK-PTX-NEXT:    cvt.rn.f64.s32 %fd5, %r10;
 ; CHECK-PTX-NEXT:    add.rn.f64 %fd6, %fd5, %fd4;
-; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r10, %fd6;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r10;
+; CHECK-PTX-NEXT:    cvt.rzi.s32.f64 %r11, %fd6;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r11;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -152,8 +150,8 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    .reg .b64 %SP;
 ; CHECK-PTX-NEXT:    .reg .b64 %SPL;
 ; CHECK-PTX-NEXT:    .reg .b16 %rs<6>;
-; CHECK-PTX-NEXT:    .reg .b32 %r<7>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<11>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<8>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<8>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    mov.u64 %SPL, __local_depot2;
@@ -175,13 +173,11 @@ define dso_local i32 @variadics2(i32 noundef %first, ...) {
 ; CHECK-PTX-NEXT:    shl.b16 %rs4, %rs3, 8;
 ; CHECK-PTX-NEXT:    or.b16 %rs5, %rs4, %rs2;
 ; CHECK-PTX-NEXT:    st.u16 [%SP+0], %rs5;
-; CHECK-PTX-NEXT:    ld.u64 %rd8, [%rd3+8];
-; CHECK-PTX-NEXT:    add.s32 %r4, %r1, %r2;
-; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r3;
-; CHECK-PTX-NEXT:    cvt.u64.u32 %rd9, %r5;
-; CHECK-PTX-NEXT:    add.s64 %rd10, %rd9, %rd8;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r6, %rd10;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r6;
+; CHECK-PTX-NEXT:    ld.u32 %r4, [%rd3+8];
+; CHECK-PTX-NEXT:    add.s32 %r5, %r1, %r2;
+; CHECK-PTX-NEXT:    add.s32 %r6, %r5, %r3;
+; CHECK-PTX-NEXT:    add.s32 %r7, %r6, %r4;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r7;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8
@@ -347,20 +343,19 @@ entry:
 define dso_local i32 @variadics4(ptr noundef byval(%struct.S2) align 8 %first, ...) {
 ; CHECK-PTX-LABEL: variadics4(
 ; CHECK-PTX:       {
-; CHECK-PTX-NEXT:    .reg .b32 %r<2>;
-; CHECK-PTX-NEXT:    .reg .b64 %rd<9>;
+; CHECK-PTX-NEXT:    .reg .b32 %r<6>;
+; CHECK-PTX-NEXT:    .reg .b64 %rd<4>;
 ; CHECK-PTX-EMPTY:
 ; CHECK-PTX-NEXT:  // %bb.0: // %entry
 ; CHECK-PTX-NEXT:    ld.param.u64 %rd1, [variadics4_param_1];
 ; CHECK-PTX-NEXT:    add.s64 %rd2, %rd1, 7;
 ; CHECK-PTX-NEXT:    and.b64 %rd3, %rd2, -8;
-; CHECK-PTX-NEXT:    ld.u64 %rd4, [%rd3];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd5, [variadics4_param_0];
-; CHECK-PTX-NEXT:    ld.param.u64 %rd6, [variadics4_param_0+8];
-; CHECK-PTX-NEXT:    add.s64 %rd7, %rd5, %rd6;
-; CHECK-PTX-NEXT:    add.s64 %rd8, %rd7, %rd4;
-; CHECK-PTX-NEXT:    cvt.u32.u64 %r1, %rd8;
-; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-PTX-NEXT:    ld.u32 %r1, [%rd3];
+; CHECK-PTX-NEXT:    ld.param.u32 %r2, [variadics4_param_0];
+; CHECK-PTX-NEXT:    ld.param.u32 %r3, [variadics4_param_0+8];
+; CHECK-PTX-NEXT:    add.s32 %r4, %r2, %r3;
+; CHECK-PTX-NEXT:    add.s32 %r5, %r4, %r1;
+; CHECK-PTX-NEXT:    st.param.b32 [func_retval0+0], %r5;
 ; CHECK-PTX-NEXT:    ret;
 entry:
   %vlist = alloca ptr, align 8

>From 825c9caf929fae0b024da6632a2ac99dfa24798c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:49:33 -0700
Subject: [PATCH 12/56] Save work

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 ++++++++++++++++++++-
 1 file changed, 29 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a913de8fa71acd..ae291de7d38fed 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -747,7 +747,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT});
+                       ISD::VSELECT, ISD::SELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6108,6 +6108,32 @@ static SDValue PerformLOADCombine(SDNode *N,
       DL);
 }
 
+// These transformations were once reliably performed by instcombine, but thanks
+// to poison semantics they are no longer safe for LLVM IR, perform them here
+// instead.
+static SDValue PerformSELECTCombine(SDNode *N,
+                                    TargetLowering::DAGCombinerInfo &DCI) {
+  return SDValue();
+  if (!(N->getValueType(0) == MVT::i1))
+    return SDValue();
+
+  unsigned Opcode = N->getOpcode();
+  SDValue SecondOperand;
+  if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
+    // (select cond, x, false) -> (and cond, x)
+    Opcode = ISD::AND;
+    SecondOperand = N->getOperand(1);
+  } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
+    // (select cond, true, x) -> (or cond, x)
+    Opcode = ISD::OR;
+    SecondOperand = N->getOperand(2);
+  } else {
+    return SDValue();
+  }
+
+  return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
+}
+
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6130,6 +6156,8 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
       return PerformLOADCombine(N, DCI);
+    case ISD::SELECT:
+      return PerformSELECTCombine(N, DCI);
     case NVPTXISD::StoreRetval:
     case NVPTXISD::StoreRetvalV2:
     case NVPTXISD::StoreRetvalV4:

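The folds sketched in PerformSELECTCombine (left disabled here by the early
return while the work is in progress) are the usual i1 select rewrites,
which instcombine can no longer do in IR because of poison semantics:

  ; (select cond, x, false) --> (and cond, x)
  %r1 = select i1 %c, i1 %x, i1 false   ; becomes: and i1 %c, %x
  ; (select cond, true, x) --> (or cond, x)
  %r2 = select i1 %c, i1 true, i1 %x    ; becomes: or i1 %c, %x
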
>From dbb360aceb2f2331190c873e60fb90aa5e7c58cf Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 17:53:12 -0700
Subject: [PATCH 13/56] Modify boolean-patterns.ll

---
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td     | 6 ------
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 6 ++----
 2 files changed, 2 insertions(+), 10 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index f6bbf4c2ffc02f..da0075dbc26e9c 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1577,12 +1577,6 @@ defm XOR : BITWISE<"xor", xor>;
 def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
 def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
 
-// These transformations were once reliably performed by instcombine, but thanks
-// to poison semantics they are no longer safe for LLVM IR, perform them here
-// instead.
-def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
-def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
-
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d38880599d1e62..d0b93763682398 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -18,16 +18,14 @@ define i1 @m2and_ri(i1 %a) {
 
 ; CHECK-LABEL: select2or
 define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
 }
 
 ; CHECK-LABEL: select2and
 define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.b16 %rs{{[0-9]+}}, %rs{{[0-9]+}}, %rs{{[0-9]+}}
-; CHECK-NOT: selp
+; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
 }

>From 1232551cefad53ec4764f3eed7fc9a3ec72a8bd4 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 16 Jul 2024 18:51:45 -0700
Subject: [PATCH 14/56] Revert "Save work"

This reverts commit 680ac60d4e67194e6ba9c2ee8ed14fbe2b49cdc9.
---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 30 +--------------------
 1 file changed, 1 insertion(+), 29 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index ae291de7d38fed..a913de8fa71acd 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -747,7 +747,7 @@ NVPTXTargetLowering::NVPTXTargetLowering(const NVPTXTargetMachine &TM,
   // We have some custom DAG combine patterns for these nodes
   setTargetDAGCombine({ISD::ADD, ISD::AND, ISD::EXTRACT_VECTOR_ELT, ISD::FADD,
                        ISD::LOAD, ISD::MUL, ISD::SHL, ISD::SREM, ISD::UREM,
-                       ISD::VSELECT, ISD::SELECT});
+                       ISD::VSELECT});
 
   // setcc for f16x2 and bf16x2 needs special handling to prevent
   // legalizer's attempt to scalarize it due to v2i1 not being legal.
@@ -6108,32 +6108,6 @@ static SDValue PerformLOADCombine(SDNode *N,
       DL);
 }
 
-// These transformations were once reliably performed by instcombine, but thanks
-// to poison semantics they are no longer safe for LLVM IR, perform them here
-// instead.
-static SDValue PerformSELECTCombine(SDNode *N,
-                                    TargetLowering::DAGCombinerInfo &DCI) {
-  return SDValue();
-  if (!(N->getValueType(0) == MVT::i1))
-    return SDValue();
-
-  unsigned Opcode = N->getOpcode();
-  SDValue SecondOperand;
-  if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(2)); Const && Const->isZero()) {
-    // (select cond, x, false) -> (and cond, x)
-    Opcode = ISD::AND;
-    SecondOperand = N->getOperand(1);
-  } else if (auto Const = dyn_cast<ConstantSDNode>(N->getOperand(1)); Const && Const->isOne()) {
-    // (select cond, true, x) -> (or cond, x)
-    Opcode = ISD::OR;
-    SecondOperand = N->getOperand(2);
-  } else {
-    return SDValue();
-  }
-
-  return DCI.DAG.getNode(Opcode, SDLoc(N), MVT::i1, N->getOperand(0), SecondOperand);
-}
-
 SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
                                                DAGCombinerInfo &DCI) const {
   CodeGenOptLevel OptLevel = getTargetMachine().getOptLevel();
@@ -6156,8 +6130,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
       return PerformSETCCCombine(N, DCI, STI.getSmVersion());
     case ISD::LOAD:
       return PerformLOADCombine(N, DCI);
-    case ISD::SELECT:
-      return PerformSELECTCombine(N, DCI);
     case NVPTXISD::StoreRetval:
     case NVPTXISD::StoreRetvalV2:
     case NVPTXISD::StoreRetvalV4:

>From fb1d477d4aa4fb982343337b54d58d7c2a19a963 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:37:24 -0700
Subject: [PATCH 15/56] Save work

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 47f5677c5a182e..0950b7c7f0722c 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15528,6 +15528,17 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
+  switch (N0.getOpcode()) {
+  case ISD::ADD:
+  case ISD::SUB:
+  case ISD::MUL:
+  case ISD::AND:
+  case ISD::OR:
+  case ISD::XOR:
+    // if ()
+    break; 
+  }
+
   return SDValue();
 }
 

>From af029549b37955622555c8ad1f404f7f07b24360 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:38:25 -0700
Subject: [PATCH 16/56] Remove improper TLI use

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 0950b7c7f0722c..d11a5d8d915b86 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15484,8 +15484,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     if (!LegalOperations && N0.hasOneUse() &&
-        (TLI.shouldReduceRegisterPressure() ||
-         isConstantOrConstantVector(N0.getOperand(0), true) ||
+        (isConstantOrConstantVector(N0.getOperand(0), true) ||
          isConstantOrConstantVector(N0.getOperand(1), true))) {
       // TODO: We already restricted this to pre-legalization, but for vectors
       // we are extra cautious to not create an unsupported operation.

>From 23f158cbf61aea38ad714d63e4dccfc28921caf1 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:39:53 -0700
Subject: [PATCH 17/56] Remove white space

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
index a913de8fa71acd..8812136733fb24 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.cpp
@@ -6143,7 +6143,6 @@ SDValue NVPTXTargetLowering::PerformDAGCombine(SDNode *N,
     case ISD::VSELECT:
       return PerformVSELECTCombine(N, DCI);
   }
-
   return SDValue();
 }
 

>From 9bb99d8ba59ba9ddf694f8b041a090a00c105238 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 10:40:27 -0700
Subject: [PATCH 18/56] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     | 4 +---
 2 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d11a5d8d915b86..2d8e8e17e43eed 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15535,7 +15535,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
   case ISD::OR:
   case ISD::XOR:
     // if ()
-    break; 
+    break;
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 374c418cd21583..eff30087f07651 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,9 +506,7 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool shouldReduceRegisterPressure() const override {
-    return true;
-  }
+  bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {

>From c23b3cb0d5b2617c2cbd765a139c96ca658a78ac Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 17 Jul 2024 11:41:20 -0700
Subject: [PATCH 19/56] Implement transform in DAG combine

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 25 ++++++++++++-------
 llvm/lib/Target/NVPTX/NVPTXInstrInfo.td       |  6 +++++
 2 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2d8e8e17e43eed..d7fb662ff88e77 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15527,15 +15527,22 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  switch (N0.getOpcode()) {
-  case ISD::ADD:
-  case ISD::SUB:
-  case ISD::MUL:
-  case ISD::AND:
-  case ISD::OR:
-  case ISD::XOR:
-    // if ()
-    break;
+  if (TLI.shouldReduceRegisterPressure()) {
+    switch (N0.getOpcode()) {
+    case ISD::ADD:
+    case ISD::SUB:
+    case ISD::MUL:
+    case ISD::AND:
+    case ISD::OR:
+    case ISD::XOR:
+      if (!(N0.hasOneUse() && VT.isScalarInteger()))
+        break;
+      if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
+        break;
+      SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
+      SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
+      return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+    }
   }
 
   return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
index da0075dbc26e9c..f6bbf4c2ffc02f 100644
--- a/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
+++ b/llvm/lib/Target/NVPTX/NVPTXInstrInfo.td
@@ -1577,6 +1577,12 @@ defm XOR : BITWISE<"xor", xor>;
 def : Pat<(mul Int1Regs:$a, Int1Regs:$b), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
 def : Pat<(mul Int1Regs:$a, (i1 imm:$b)), (ANDb1ri Int1Regs:$a, imm:$b)>;
 
+// These transformations were once reliably performed by instcombine, but thanks
+// to poison semantics they are no longer safe for LLVM IR, perform them here
+// instead.
+def : Pat<(select Int1Regs:$a, Int1Regs:$b, 0), (ANDb1rr Int1Regs:$a, Int1Regs:$b)>;
+def : Pat<(select Int1Regs:$a, 1, Int1Regs:$b), (ORb1rr Int1Regs:$a, Int1Regs:$b)>;
+
 // Lower logical v2i16/v4i8 ops as bitwise ops on b32.
 foreach vt = [v2i16, v4i8] in {
   def: Pat<(or (vt Int32Regs:$a), (vt Int32Regs:$b)),

>From 323ad50469066b3eaa4773b0699fb060217d79b6 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:36:45 -0700
Subject: [PATCH 20/56] Save work

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 10 +++++++++-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index eff30087f07651..2a769b26312ced 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -500,12 +500,20 @@ class NVPTXTargetLowering : public TargetLowering {
 
   bool isTruncateFree(Type *SrcTy, Type *DstTy) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
-    if (!SrcTy->isIntegerTy() || !DstTy->isIntegerTy())
+    if (!(SrcTy->isIntegerTy() && DstTy->isIntegerTy()))
       return false;
     return SrcTy->getPrimitiveSizeInBits() == 64 &&
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+    // Truncating 64-bit to 32-bit is free in SASS.
+    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+      return false;
+    return FromVT.getFixedSizeInBits() == 64 &&
+           ToVT.getFixedSizeInBits() == 32;
+  }
+
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From dd35df16d0e902892eb3f3018315b0ef4cf3090a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:37:01 -0700
Subject: [PATCH 21/56] Remove isTruncateFree

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2a769b26312ced..d486023a5205cf 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,14 +506,6 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
-    // Truncating 64-bit to 32-bit is free in SASS.
-    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
-      return false;
-    return FromVT.getFixedSizeInBits() == 64 &&
-           ToVT.getFixedSizeInBits() == 32;
-  }
-
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From a33c90bf4463cd35c05a0a23d54a9611bb4e3c57 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:45:56 -0700
Subject: [PATCH 22/56] Use ptr in test

---
 llvm/test/CodeGen/NVPTX/combine-truncate.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index a2b4c444e920f0..221b89c7812e78 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -59,7 +59,7 @@ define i32 @trunc_cvt(i64 %a, i64 %b) {
   ret i32 %trunc
 }
 
-define i32 @trunc_cvt_not(i64 %a, i64 %b) {
+define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
 ; CHECK-LABEL: trunc_cvt_not(
 ; CHECK:       {
 ; CHECK-NEXT:    .reg .b32 %r<4>;
@@ -69,7 +69,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-NEXT:    ld.param.u64 %rd1, [trunc_cvt_not_param_0];
 ; CHECK-NEXT:    ld.param.u64 %rd2, [trunc_cvt_not_param_1];
 ; CHECK-NEXT:    add.s64 %rd3, %rd1, %rd2;
-; CHECK-NEXT:    mov.u64 %rd4, 0;
+; CHECK-NEXT:    ld.param.u64 %rd4, [trunc_cvt_not_param_2];
 ; CHECK-NEXT:    st.u64 [%rd4], %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r1, %rd3;
 ; CHECK-NEXT:    cvt.u32.u64 %r2, %rd1;
@@ -77,7 +77,7 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b) {
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
 ; CHECK-NEXT:    ret;
   %add = add i64 %a, %b
-  store i64 %add, ptr null
+  store i64 %add, ptr %p
   %or = or i64 %add, %a
   %trunc = trunc i64 %or to i32
   ret i32 %trunc

>From cf682f44cfe6ba37ab3e43554b247c2fd0978ac8 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 15:55:31 -0700
Subject: [PATCH 23/56] Only run on free truncs

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  8 ++++++++
 llvm/test/CodeGen/NVPTX/combine-truncate.ll   | 16 ++++++++++++++++
 3 files changed, 25 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d7fb662ff88e77..cf703f910596a7 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15535,7 +15535,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger()))
+      if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index d486023a5205cf..2a769b26312ced 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,6 +506,14 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
+  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+    // Truncating 64-bit to 32-bit is free in SASS.
+    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+      return false;
+    return FromVT.getFixedSizeInBits() == 64 &&
+           ToVT.getFixedSizeInBits() == 32;
+  }
+
   bool shouldReduceRegisterPressure() const override { return true; }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
diff --git a/llvm/test/CodeGen/NVPTX/combine-truncate.ll b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
index 221b89c7812e78..1c1c091e39bb41 100644
--- a/llvm/test/CodeGen/NVPTX/combine-truncate.ll
+++ b/llvm/test/CodeGen/NVPTX/combine-truncate.ll
@@ -82,3 +82,19 @@ define i32 @trunc_cvt_not(i64 %a, i64 %b, ptr %p) {
   %trunc = trunc i64 %or to i32
   ret i32 %trunc
 }
+
+define i16 @trunc_i32_to_i16_not(i32 %a, i32 %b) {
+; CHECK-LABEL: trunc_i32_to_i16_not(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<4>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u16 %r1, [trunc_i32_to_i16_not_param_0];
+; CHECK-NEXT:    ld.param.u16 %r2, [trunc_i32_to_i16_not_param_1];
+; CHECK-NEXT:    or.b32 %r3, %r1, %r2;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r3;
+; CHECK-NEXT:    ret;
+  %or = or i32 %a, %b
+  %trunc = trunc i32 %or to i16
+  ret i16 %trunc
+}
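
For contrast with the negative test above: only the i64-to-i32 truncate is
reported free on NVPTX, so the pattern the combine is expected to narrow looks
like the following sketch (in the spirit of the existing combine-truncate.ll
tests; the function name is illustrative, not part of this patch):

  define i32 @trunc_i64_to_i32(i64 %a, i64 %b) {
    ; The 64-bit or feeds only a truncate, so the combine can rewrite it as
    ; a 32-bit or of the truncated operands.
    %or = or i64 %a, %b
    %trunc = trunc i64 %or to i32
    ret i32 %trunc
  }
  ; Expected PTX, roughly: two cvt.u32.u64 of the operands feeding a single
  ; or.b32, instead of an or.b64 followed by one cvt.u32.u64.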

>From 1645b5adf562a05efc26aba05c9aac0b7bccea6a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:04:56 -0700
Subject: [PATCH 24/56] Add comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index cf703f910596a7..06e40b0c0de198 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5902,6 +5902,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
+  // 
+  // For targets that are particularly sensitive to register pressure it's preferable to
+  // increase the number of truncate instructions in order to decrease the bit 
+  // width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.

>From 64725364681b173de520d9523b245d80b83cc978 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:05:32 -0700
Subject: [PATCH 25/56] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 11 ++++++-----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  3 +--
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 06e40b0c0de198..d1b4d82956f0dd 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5902,10 +5902,10 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  // 
-  // For targets that are particularly sensitive to register pressure it's preferable to
-  // increase the number of truncate instructions in order to decrease the bit 
-  // width of the logic_op.
+  //
+  // For targets that are particularly sensitive to register pressure it's
+  // preferable to increase the number of truncate instructions in order to
+  // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
@@ -15539,7 +15539,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger() && TLI.isTruncateFree(SrcVT, VT)))
+      if (!(N0.hasOneUse() && VT.isScalarInteger() &&
+            TLI.isTruncateFree(SrcVT, VT)))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2a769b26312ced..1d9d86b484ed05 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -510,8 +510,7 @@ class NVPTXTargetLowering : public TargetLowering {
     // Truncating 64-bit to 32-bit is free in SASS.
     if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
       return false;
-    return FromVT.getFixedSizeInBits() == 64 &&
-           ToVT.getFixedSizeInBits() == 32;
+    return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
   }
 
   bool shouldReduceRegisterPressure() const override { return true; }

>From 36ff055463603dcbb063579cad350705faa7f96d Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:47 -0700
Subject: [PATCH 26/56] explicitly mention GPU in comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index d1b4d82956f0dd..1a04f18de41a10 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5903,7 +5903,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   //
-  // For targets that are particularly sensitive to register pressure it's
+  // For targets that are particularly sensitive to register pressure (e.g. GPUs) it's
   // preferable to increase the number of truncate instructions in order to
   // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {

>From d41162a15f425f5103550420ca5d583bb7c19ca5 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:06:59 -0700
Subject: [PATCH 27/56] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 1a04f18de41a10..a66547bea3e09f 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5903,8 +5903,8 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   //
-  // For targets that are particularly sensitive to register pressure (e.g. GPUs) it's
-  // preferable to increase the number of truncate instructions in order to
+  // For targets that are particularly sensitive to register pressure (e.g. GPUs)
+  // it's preferable to increase the number of truncate instructions in order to
   // decrease the bit width of the logic_op.
   if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
     // If both operands have other uses, this transform would create extra
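
To make the tradeoff described in the comment concrete, the two forms below
compute the same value; this is an illustrative sketch, not part of the diff
(function names are invented):

  define i32 @narrow_form(i64 %x, i64 %y) {
    ; Direction this PR prefers on pressure-sensitive targets: truncate the
    ; operands early so fewer 64-bit values stay live.
    %t = trunc i64 %x to i32
    %u = trunc i64 %y to i32
    %r = or i32 %t, %u
    ret i32 %r
  }

  define i32 @wide_form(i64 %x, i64 %y) {
    ; Default direction: one wide op and a single truncate, i.e. fewer
    ; instructions but a longer-lived 64-bit value.
    %w = or i64 %x, %y
    %r = trunc i64 %w to i32
    ret i32 %r
  }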

>From af94339bb5d537368655b86d93f56b1ae2e9a626 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:11:56 -0700
Subject: [PATCH 28/56] Add comment to TLI function

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index cc44e80c4d65f7..0ccc4a0de7973d 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2988,6 +2988,8 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
+  // Return true if the target will accept tradeoffs (e.g. increasing the number 
+  // of instructions) to reduce register pressure.
   virtual bool shouldReduceRegisterPressure() const { return false; }
 
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }

>From 162be1fcedf8605fc951596d8c59e98c95009ab8 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:12:11 -0700
Subject: [PATCH 29/56] clang-format

---
 llvm/include/llvm/CodeGen/TargetLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index 0ccc4a0de7973d..dac610c8f94124 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2988,7 +2988,7 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
-  // Return true if the target will accept tradeoffs (e.g. increasing the number 
+  // Return true if the target will accept tradeoffs (e.g. increasing the number
   // of instructions) to reduce register pressure.
   virtual bool shouldReduceRegisterPressure() const { return false; }
 

>From f94449aa3f20958d79e52e9e9d1b0a201ffe1a86 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 22 Jul 2024 16:41:32 -0700
Subject: [PATCH 30/56] Update boolean-patterns.ll

---
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 72 ++++++++++++++++++---
 1 file changed, 63 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index d0b93763682398..de0437cb96e398 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -1,31 +1,85 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
 ; RUN: llc < %s -march=nvptx64 -mcpu=sm_20 | FileCheck %s
 ; RUN: %if ptxas %{ llc < %s -march=nvptx64 -mcpu=sm_20 | %ptxas-verify %}
 
-; CHECK-LABEL: m2and_rr
+target triple = "nvptx64-nvidia-cuda"
+
 define i1 @m2and_rr(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_rr(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [m2and_rr_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [m2and_rr_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    and.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = mul i1 %a, %b
   ret i1 %r
 }
 
-; CHECK-LABEL: m2and_ri
 define i1 @m2and_ri(i1 %a) {
-; CHECK-NOT: mul
+; CHECK-LABEL: m2and_ri(
+; CHECK:       {
+; CHECK-NEXT:    .reg .b32 %r<3>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %r1, [m2and_ri_param_0];
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
+; CHECK-NEXT:    ret;
   %r = mul i1 %a, 1
   ret i1 %r
 }
 
-; CHECK-LABEL: select2or
 define i1 @select2or(i1 %a, i1 %b) {
-; CHECK: or.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2or(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [select2or_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    or.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
 }
 
-; CHECK-LABEL: select2and
 define i1 @select2and(i1 %a, i1 %b) {
-; CHECK: and.pred %p{{[0-9]+}}, %p{{[0-9]+}}, %p{{[0-9]+}}
+; CHECK-LABEL: select2and(
+; CHECK:       {
+; CHECK-NEXT:    .reg .pred %p<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
+; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-EMPTY:
+; CHECK-NEXT:  // %bb.0:
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_1];
+; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
+; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
+; CHECK-NEXT:    ld.param.u8 %rs3, [select2and_param_0];
+; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
+; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
+; CHECK-NEXT:    and.pred %p3, %p2, %p1;
+; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
 }

>From a7a9003b593c95828c84b1d9cdd48790d2186d4b Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:12:00 -0700
Subject: [PATCH 31/56]  Use isNarrowingProfitable instead of
 shouldReduceRegisterPressure

---
 llvm/include/llvm/CodeGen/TargetLowering.h    |  4 --
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 18 ++++-----
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     |  8 ++--
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll   | 40 ++++++++-----------
 llvm/test/CodeGen/X86/apx/and.ll              |  6 +--
 ...-merge-scalar-constmask-interleavedbits.ll | 16 ++++----
 llvm/test/CodeGen/X86/vec_saddo.ll            |  8 +++-
 llvm/test/CodeGen/X86/vec_uaddo.ll            |  8 +++-
 8 files changed, 50 insertions(+), 58 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h
index dac610c8f94124..3842af56e6b3d7 100644
--- a/llvm/include/llvm/CodeGen/TargetLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetLowering.h
@@ -2988,10 +2988,6 @@ class TargetLoweringBase {
     return isTruncateFree(Val.getValueType(), VT2);
   }
 
-  // Return true if the target will accept tradeoffs (e.g. increasing the number
-  // of instructions) to reduce register pressure.
-  virtual bool shouldReduceRegisterPressure() const { return false; }
-
   virtual bool isProfitableToHoist(Instruction *I) const { return true; }
 
   /// Return true if the extension represented by \p I is free.
diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index a66547bea3e09f..74b0ad726911bf 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5902,11 +5902,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
   }
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
-  //
-  // For targets that are particularly sensitive to register pressure (e.g. GPUs)
-  // it's preferable to increase the number of truncate instructions in order to
-  // decrease the bit width of the logic_op.
-  if (HandOpcode == ISD::TRUNCATE && !TLI.shouldReduceRegisterPressure()) {
+  if (HandOpcode == ISD::TRUNCATE) {
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5918,9 +5914,14 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     if (LegalOperations && !TLI.isOperationLegal(LogicOpcode, XVT))
       return SDValue();
     // Be extra careful sinking truncate. If it's free, there's no benefit in
-    // widening a binop. Also, don't create a logic op on an illegal type.
+    // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
+    // Prevent an infinite loop if the target prefers the inverse 
+    // transformation.
+    if (TLI.isNarrowingProfitable(XVT, VT))
+      return SDValue();
+    // Don't create a logic op on an illegal type.
     if (!TLI.isTypeLegal(XVT))
       return SDValue();
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
@@ -15531,7 +15532,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  if (TLI.shouldReduceRegisterPressure()) {
+  if (TLI.isNarrowingProfitable(SrcVT, VT)) {
     switch (N0.getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
@@ -15539,8 +15540,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     case ISD::AND:
     case ISD::OR:
     case ISD::XOR:
-      if (!(N0.hasOneUse() && VT.isScalarInteger() &&
-            TLI.isTruncateFree(SrcVT, VT)))
+      if (!(N0.hasOneUse() && VT.isScalarInteger()))
         break;
       if (LegalOperations && !TLI.isOperationLegal(N0.getOpcode(), VT))
         break;
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 1d9d86b484ed05..2874d50ae5b207 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,15 +506,13 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool isTruncateFree(EVT FromVT, EVT ToVT) const override {
+  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
-    if (!(FromVT.isScalarInteger() && ToVT.isScalarInteger()))
+    if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
       return false;
-    return FromVT.getFixedSizeInBits() == 64 && ToVT.getFixedSizeInBits() == 32;
+    return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
   }
 
-  bool shouldReduceRegisterPressure() const override { return true; }
-
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,
                          EVT VT) const override {
     if (VT.isVector())
diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index de0437cb96e398..c30ebccdf9199c 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -43,20 +43,16 @@ define i1 @m2and_ri(i1 %a) {
 define i1 @select2or(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2or(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_1];
-; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT:    ld.param.u8 %rs3, [select2or_param_0];
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT:    or.pred %p3, %p2, %p1;
-; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [select2or_param_1];
+; CHECK-NEXT:    or.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 1, i1 %b
   ret i1 %r
@@ -65,20 +61,16 @@ define i1 @select2or(i1 %a, i1 %b) {
 define i1 @select2and(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2and(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .pred %p<4>;
-; CHECK-NEXT:    .reg .b16 %rs<5>;
-; CHECK-NEXT:    .reg .b32 %r<2>;
+; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
-; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_1];
-; CHECK-NEXT:    and.b16 %rs2, %rs1, 1;
-; CHECK-NEXT:    setp.eq.b16 %p1, %rs2, 1;
-; CHECK-NEXT:    ld.param.u8 %rs3, [select2and_param_0];
-; CHECK-NEXT:    and.b16 %rs4, %rs3, 1;
-; CHECK-NEXT:    setp.eq.b16 %p2, %rs4, 1;
-; CHECK-NEXT:    and.pred %p3, %p2, %p1;
-; CHECK-NEXT:    selp.u32 %r1, 1, 0, %p3;
-; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r1;
+; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_0];
+; CHECK-NEXT:    ld.param.u8 %rs2, [select2and_param_1];
+; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b32 %r2, %r1, 1;
+; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
   %r = select i1 %a, i1 %b, i1 0
   ret i1 %r
diff --git a/llvm/test/CodeGen/X86/apx/and.ll b/llvm/test/CodeGen/X86/apx/and.ll
index 23aed77b948b97..de27976a122f7b 100644
--- a/llvm/test/CodeGen/X86/apx/and.ll
+++ b/llvm/test/CodeGen/X86/apx/and.ll
@@ -5,14 +5,12 @@
 define i8 @and8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: and8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    andl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x21,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    andb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x20,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: and8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} andl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x21,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} andb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x20,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %and = and i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
index c4c4e5ed1fddeb..2e2c152c5506ad 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbits.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $85, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $85, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $85, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $85, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 460c5fe11f82a5..0aea89e4acf018 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,12 +1045,16 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: saddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index bac118095331ca..34e2811cb9bb5c 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,12 +1098,16 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
-; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
+; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
+; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
+; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
+; AVX512-NEXT:    kshiftlw $12, %k0, %k0
+; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
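
A note on the isNarrowingProfitable guard added to
hoistLogicOpWithSameOpcodeHands in this patch: the two combines are inverses,
so without the guard they could rewrite the same nodes back and forth forever.
An illustrative sketch (invented function name):

  define i32 @ping_pong(i64 %x, i64 %y) {
    %t = trunc i64 %x to i32
    %u = trunc i64 %y to i32
    %r = or i32 %t, %u
    ret i32 %r
  }
  ; Ungated, hoistLogicOpWithSameOpcodeHands rewrites %r to trunc (or i64 %x, %y);
  ; visitTRUNCATE, seeing narrowing as profitable, immediately rewrites it back.
  ; The isNarrowingProfitable check disables the hoist direction on such targets,
  ; breaking the cycle.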

>From d354239b5cf5df22f233facad8e5315bb1d476e7 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 23 Jul 2024 16:13:40 -0700
Subject: [PATCH 32/56] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h     | 3 ++-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 74b0ad726911bf..990c0d57660752 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5917,7 +5917,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
-    // Prevent an infinite loop if the target prefers the inverse 
+    // Prevent an infinite loop if the target prefers the inverse
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();
diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 2874d50ae5b207..32eee0cb301873 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -510,7 +510,8 @@ class NVPTXTargetLowering : public TargetLowering {
     // Truncating 64-bit to 32-bit is free in SASS.
     if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
       return false;
-    return SrcVT.getFixedSizeInBits() == 64 && DestVT.getFixedSizeInBits() == 32;
+    return SrcVT.getFixedSizeInBits() == 64 &&
+           DestVT.getFixedSizeInBits() == 32;
   }
 
   EVT getSetCCResultType(const DataLayout &DL, LLVMContext &Ctx,

>From 2a9ca867c39e9888b2a4b199320a46b1e9a08bd2 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 19:40:50 -0700
Subject: [PATCH 33/56] Update tests

---
 llvm/test/CodeGen/X86/2012-08-16-setcc.ll    | 10 ++---
 llvm/test/CodeGen/X86/add-and-not.ll         | 16 ++++----
 llvm/test/CodeGen/X86/and-encoding.ll        |  4 +-
 llvm/test/CodeGen/X86/and-with-overflow.ll   |  4 +-
 llvm/test/CodeGen/X86/atomic-unordered.ll    |  2 +-
 llvm/test/CodeGen/X86/avx512-calling-conv.ll |  8 ++--
 llvm/test/CodeGen/X86/avx512-intrinsics.ll   | 40 ++++++++++----------
 llvm/test/CodeGen/X86/avx512dq-mask-op.ll    |  2 +-
 llvm/test/CodeGen/X86/cmp-bool.ll            |  2 +-
 llvm/test/CodeGen/X86/cmp-concat.ll          |  2 +-
 llvm/test/CodeGen/X86/ctlo.ll                | 18 +++++----
 llvm/test/CodeGen/X86/ctlz.ll                | 32 ++++++++--------
 12 files changed, 71 insertions(+), 69 deletions(-)

diff --git a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
index 89ae5680e3ba94..f82439a7d82eec 100644
--- a/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
+++ b/llvm/test/CodeGen/X86/2012-08-16-setcc.ll
@@ -6,8 +6,8 @@
 define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 ; CHECK-LABEL: and_1:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    xorl %eax, %eax
+; CHECK-NEXT:    testb %dil, %sil
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = and i8 %b, %a
@@ -19,7 +19,7 @@ define i32 @and_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 define zeroext i1 @and_2(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: and_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    testl %edi, %esi
+; CHECK-NEXT:    testb %dil, %sil
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %1 = and i8 %b, %a
@@ -31,7 +31,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 ; CHECK-LABEL: xor_1:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    xorl %eax, %eax
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %dil, %sil
 ; CHECK-NEXT:    cmovnel %edx, %eax
 ; CHECK-NEXT:    retq
   %1 = xor i8 %b, %a
@@ -43,7 +43,7 @@ define i32 @xor_1(i8 zeroext %a, i8 zeroext %b, i32 %x) {
 define zeroext i1 @xor_2(i8 zeroext %a, i8 zeroext %b) {
 ; CHECK-LABEL: xor_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %dil, %sil
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %1 = xor i8 %b, %a
diff --git a/llvm/test/CodeGen/X86/add-and-not.ll b/llvm/test/CodeGen/X86/add-and-not.ll
index 10e3a6bf6d5330..3f1798e2f9bdd4 100644
--- a/llvm/test/CodeGen/X86/add-and-not.ll
+++ b/llvm/test/CodeGen/X86/add-and-not.ll
@@ -13,8 +13,8 @@ define i8 @add_and_xor(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -79,8 +79,8 @@ define i8 @add_and_xor_commuted1(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted1:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -98,8 +98,8 @@ define i8 @add_and_xor_commuted2(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted2:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
@@ -117,8 +117,8 @@ define i8 @add_and_xor_commuted3(i8 %x, i8 %y) {
 ;
 ; X64-LABEL: add_and_xor_commuted3:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %xor = xor i8 %x, -1
diff --git a/llvm/test/CodeGen/X86/and-encoding.ll b/llvm/test/CodeGen/X86/and-encoding.ll
index 248686ff8b7a2b..64897a1d432221 100644
--- a/llvm/test/CodeGen/X86/and-encoding.ll
+++ b/llvm/test/CodeGen/X86/and-encoding.ll
@@ -22,7 +22,7 @@ define void @f1() nounwind {
 define void @f2(i16 %x, ptr%y) nounwind  {
 ; CHECK-LABEL: f2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT:    andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT:    movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %c = trunc i16 %x to i1
@@ -33,7 +33,7 @@ define void @f2(i16 %x, ptr%y) nounwind  {
 define void @f3(i32 %x, ptr%y) nounwind {
 ; CHECK-LABEL: f3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl $1, %edi # encoding: [0x83,0xe7,0x01]
+; CHECK-NEXT:    andb $1, %dil # encoding: [0x40,0x80,0xe7,0x01]
 ; CHECK-NEXT:    movb %dil, (%rsi) # encoding: [0x40,0x88,0x3e]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
   %c = trunc i32 %x to i1
diff --git a/llvm/test/CodeGen/X86/and-with-overflow.ll b/llvm/test/CodeGen/X86/and-with-overflow.ll
index a63f6cc6ea7e26..70c8317915239a 100644
--- a/llvm/test/CodeGen/X86/and-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/and-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @and_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: and_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    andl %edi, %eax
+; X64-NEXT:    andb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/atomic-unordered.ll b/llvm/test/CodeGen/X86/atomic-unordered.ll
index 3fb994cdb751a3..eb7a30f4742772 100644
--- a/llvm/test/CodeGen/X86/atomic-unordered.ll
+++ b/llvm/test/CodeGen/X86/atomic-unordered.ll
@@ -2009,7 +2009,7 @@ define i32 @split_load(ptr %p) {
 ; CHECK-O3-NEXT:    movq (%rdi), %rax
 ; CHECK-O3-NEXT:    movq %rax, %rcx
 ; CHECK-O3-NEXT:    shrq $32, %rcx
-; CHECK-O3-NEXT:    orl %eax, %ecx
+; CHECK-O3-NEXT:    orb %al, %cl
 ; CHECK-O3-NEXT:    movzbl %cl, %eax
 ; CHECK-O3-NEXT:    retq
   %v = load atomic i64, ptr %p unordered, align 8
diff --git a/llvm/test/CodeGen/X86/avx512-calling-conv.ll b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
index c27cced9d5ffa7..93fdd929f835c7 100644
--- a/llvm/test/CodeGen/X86/avx512-calling-conv.ll
+++ b/llvm/test/CodeGen/X86/avx512-calling-conv.ll
@@ -931,7 +931,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL-NEXT:    kshiftrw $13, %k0, %k1
 ; KNL-NEXT:    kmovw %k1, %r13d
 ; KNL-NEXT:    kshiftrw $14, %k0, %k1
-; KNL-NEXT:    andl $1, %edx
+; KNL-NEXT:    andb $1, %dl
 ; KNL-NEXT:    movb %dl, 2(%rax)
 ; KNL-NEXT:    kmovw %k0, %edx
 ; KNL-NEXT:    andl $1, %edx
@@ -1244,7 +1244,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; SKX-NEXT:    kshiftrd $13, %k0, %k1
 ; SKX-NEXT:    kmovd %k1, %r13d
 ; SKX-NEXT:    kshiftrd $14, %k0, %k1
-; SKX-NEXT:    andl $1, %edx
+; SKX-NEXT:    andb $1, %dl
 ; SKX-NEXT:    movb %dl, 2(%rax)
 ; SKX-NEXT:    kmovd %k0, %edx
 ; SKX-NEXT:    andl $1, %edx
@@ -1550,7 +1550,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; KNL_X32-NEXT:    kshiftrw $5, %k0, %k1
 ; KNL_X32-NEXT:    kmovw %k1, %ecx
 ; KNL_X32-NEXT:    kshiftrw $6, %k0, %k1
-; KNL_X32-NEXT:    andl $1, %ebx
+; KNL_X32-NEXT:    andb $1, %bl
 ; KNL_X32-NEXT:    movb %bl, 2(%eax)
 ; KNL_X32-NEXT:    kmovw %k0, %ebx
 ; KNL_X32-NEXT:    andl $1, %ebx
@@ -1878,7 +1878,7 @@ define <17 x i1> @test16(<17 x i1> %a, <17 x i1> %b) nounwind {
 ; FASTISEL-NEXT:    kshiftrd $13, %k0, %k1
 ; FASTISEL-NEXT:    kmovd %k1, %r13d
 ; FASTISEL-NEXT:    kshiftrd $14, %k0, %k1
-; FASTISEL-NEXT:    andl $1, %edx
+; FASTISEL-NEXT:    andb $1, %dl
 ; FASTISEL-NEXT:    movb %dl, 2(%rax)
 ; FASTISEL-NEXT:    kmovd %k0, %edx
 ; FASTISEL-NEXT:    andl $1, %edx
diff --git a/llvm/test/CodeGen/X86/avx512-intrinsics.ll b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
index b77c753107a6e1..ff4a2f99c015d5 100644
--- a/llvm/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/llvm/test/CodeGen/X86/avx512-intrinsics.ll
@@ -4548,17 +4548,17 @@ define i8 @test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
-; X64-NEXT:    orl %ecx, %edx
-; X64-NEXT:    orl %esi, %eax
-; X64-NEXT:    orl %edx, %eax
+; X64-NEXT:    orb %cl, %dl
+; X64-NEXT:    orb %sil, %al
+; X64-NEXT:    orb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_sd_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vcmplesd %xmm1, %xmm0, %k0
@@ -4566,14 +4566,14 @@ define i8 @test_int_x86_avx512_mask_cmp_sd_all(<2 x double> %x0, <2 x double> %x1
 ; X86-NEXT:    vcmpunordsd {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
 ; X86-NEXT:    vcmpneqsd %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
+; X86-NEXT:    kmovw %k0, %ebx
 ; X86-NEXT:    vcmpnltsd {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %esi, %eax
-; X86-NEXT:    orl %edx, %eax
+; X86-NEXT:    orb %cl, %dl
+; X86-NEXT:    orb %bl, %al
+; X86-NEXT:    orb %dl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
 
@@ -4625,17 +4625,17 @@ define i8 @test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; X64-NEXT:    kmovw %k0, %esi
 ; X64-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X64-NEXT:    kmovw %k0, %eax
-; X64-NEXT:    andl %ecx, %edx
-; X64-NEXT:    andl %esi, %eax
-; X64-NEXT:    andl %edx, %eax
+; X64-NEXT:    andb %cl, %dl
+; X64-NEXT:    andb %sil, %al
+; X64-NEXT:    andb %dl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
 ; X86-LABEL: test_int_x86_avx512_mask_cmp_ss_all:
 ; X86:       # %bb.0:
-; X86-NEXT:    pushl %esi
+; X86-NEXT:    pushl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 8
-; X86-NEXT:    .cfi_offset %esi, -8
+; X86-NEXT:    .cfi_offset %ebx, -8
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    kmovw %eax, %k1
 ; X86-NEXT:    vcmpless %xmm1, %xmm0, %k0
@@ -4643,14 +4643,14 @@ define i8 @test_int_x86_avx512_mask_cmp_ss_all(<4 x float> %x0, <4 x float> %x1,
 ; X86-NEXT:    vcmpunordss {sae}, %xmm1, %xmm0, %k0
 ; X86-NEXT:    kmovw %k0, %edx
 ; X86-NEXT:    vcmpneqss %xmm1, %xmm0, %k0 {%k1}
-; X86-NEXT:    kmovw %k0, %esi
+; X86-NEXT:    kmovw %k0, %ebx
 ; X86-NEXT:    vcmpnltss {sae}, %xmm1, %xmm0, %k0 {%k1}
 ; X86-NEXT:    kmovw %k0, %eax
-; X86-NEXT:    andl %ecx, %edx
-; X86-NEXT:    andl %esi, %eax
-; X86-NEXT:    andl %edx, %eax
+; X86-NEXT:    andb %cl, %dl
+; X86-NEXT:    andb %bl, %al
+; X86-NEXT:    andb %dl, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
-; X86-NEXT:    popl %esi
+; X86-NEXT:    popl %ebx
 ; X86-NEXT:    .cfi_def_cfa_offset 4
 ; X86-NEXT:    retl
   %res1 = call i8 @llvm.x86.avx512.mask.cmp.ss(<4 x float> %x0, <4 x float> %x1, i32 2, i8 -1, i32 4)
diff --git a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
index 041a86aff53fb6..17502f4a506f13 100644
--- a/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512dq-mask-op.ll
@@ -33,7 +33,7 @@ define i8 @mand8(i8 %x, i8 %y) {
 ; CHECK-LABEL: mand8:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %ma = bitcast i8 %x to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/cmp-bool.ll b/llvm/test/CodeGen/X86/cmp-bool.ll
index 617b485e0de0f2..5ba3680c1b74de 100644
--- a/llvm/test/CodeGen/X86/cmp-bool.ll
+++ b/llvm/test/CodeGen/X86/cmp-bool.ll
@@ -4,7 +4,7 @@
 define void @bool_eq(i1 zeroext %a, i1 zeroext %b, ptr nocapture %c) nounwind {
 ; CHECK-LABEL: bool_eq:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    cmpb %sil, %dil
 ; CHECK-NEXT:    je .LBB0_2
 ; CHECK-NEXT:  # %bb.1: # %if.end
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/cmp-concat.ll b/llvm/test/CodeGen/X86/cmp-concat.ll
index 5e030de1409f22..816f16667734cc 100644
--- a/llvm/test/CodeGen/X86/cmp-concat.ll
+++ b/llvm/test/CodeGen/X86/cmp-concat.ll
@@ -4,7 +4,7 @@
 define i1 @cmp_allbits_concat_i8(i8 %x, i8 %y) {
 ; CHECK-LABEL: cmp_allbits_concat_i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    cmpb $-1, %dil
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 2f4fef82f1f17a..9de0618a3aa022 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -21,7 +21,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
 ; X86-NOCMOV-NEXT:  .LBB0_1:
@@ -37,7 +37,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-CMOV-NEXT:    bsrl %eax, %ecx
 ; X86-CMOV-NEXT:    movl $15, %eax
 ; X86-CMOV-NEXT:    cmovnel %ecx, %eax
-; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    xorb $7, %al
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -48,7 +48,7 @@ define i8 @ctlo_i8(i8 %x) {
 ; X64-NEXT:    bsrl %eax, %ecx
 ; X64-NEXT:    movl $15, %eax
 ; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -58,14 +58,16 @@ define i8 @ctlo_i8(i8 %x) {
 ; X86-CLZ-NEXT:    shll $24, %eax
 ; X86-CLZ-NEXT:    notl %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
 ; X64-CLZ-LABEL: ctlo_i8:
 ; X64-CLZ:       # %bb.0:
-; X64-CLZ-NEXT:    shll $24, %edi
-; X64-CLZ-NEXT:    notl %edi
-; X64-CLZ-NEXT:    lzcntl %edi, %eax
+; X64-CLZ-NEXT:    notb %dil
+; X64-CLZ-NEXT:    movzbl %dil, %eax
+; X64-CLZ-NEXT:    lzcntl %eax, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
   %tmp1 = xor i8 %x, -1
@@ -80,7 +82,7 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ; X86-NEXT:    notb %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -89,7 +91,7 @@ define i8 @ctlo_i8_undef(i8 %x) {
 ; X64-NEXT:    notb %dil
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/ctlz.ll b/llvm/test/CodeGen/X86/ctlz.ll
index 68defaff78d37d..e7a11cf443f5e0 100644
--- a/llvm/test/CodeGen/X86/ctlz.ll
+++ b/llvm/test/CodeGen/X86/ctlz.ll
@@ -17,7 +17,7 @@ define i8 @ctlz_i8(i8 %x) {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -25,7 +25,7 @@ define i8 @ctlz_i8(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -226,7 +226,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
 ; X86-NOCMOV-NEXT:  .LBB4_1:
@@ -240,7 +240,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-CMOV-NEXT:    bsrl %eax, %ecx
 ; X86-CMOV-NEXT:    movl $15, %eax
 ; X86-CMOV-NEXT:    cmovnel %ecx, %eax
-; X86-CMOV-NEXT:    xorl $7, %eax
+; X86-CMOV-NEXT:    xorb $7, %al
 ; X86-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CMOV-NEXT:    retl
 ;
@@ -250,7 +250,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-NEXT:    bsrl %eax, %ecx
 ; X64-NEXT:    movl $15, %eax
 ; X64-NEXT:    cmovnel %ecx, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -258,7 +258,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
 ;
@@ -266,7 +266,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-CLZ:       # %bb.0:
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
 ; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
 ;
@@ -274,7 +274,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X64-FASTLZCNT:       # %bb.0:
 ; X64-FASTLZCNT-NEXT:    movzbl %dil, %eax
 ; X64-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT:    addl $-24, %eax
+; X64-FASTLZCNT-NEXT:    addb $-24, %al
 ; X64-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-FASTLZCNT-NEXT:    retq
 ;
@@ -282,7 +282,7 @@ define i8 @ctlz_i8_zero_test(i8 %n) {
 ; X86-FASTLZCNT:       # %bb.0:
 ; X86-FASTLZCNT-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT:    addl $-24, %eax
+; X86-FASTLZCNT-NEXT:    addb $-24, %al
 ; X86-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-FASTLZCNT-NEXT:    retl
   %tmp1 = call i8 @llvm.ctlz.i8(i8 %n, i1 false)
@@ -653,7 +653,7 @@ define i8 @ctlz_i8_knownbits(i8 %x)  {
 ; X86-NEXT:    orb $64, %al
 ; X86-NEXT:    movzbl %al, %eax
 ; X86-NEXT:    bsrl %eax, %eax
-; X86-NEXT:    xorl $7, %eax
+; X86-NEXT:    xorb $7, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -662,7 +662,7 @@ define i8 @ctlz_i8_knownbits(i8 %x)  {
 ; X64-NEXT:    orb $64, %dil
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    bsrl %eax, %eax
-; X64-NEXT:    xorl $7, %eax
+; X64-NEXT:    xorb $7, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 ;
@@ -962,7 +962,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-NOCMOV-NEXT:  # %bb.2: # %cond.false
 ; X86-NOCMOV-NEXT:    movzbl %al, %eax
 ; X86-NOCMOV-NEXT:    bsrl %eax, %eax
-; X86-NOCMOV-NEXT:    xorl $7, %eax
+; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    xorb $7, %al
 ; X86-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NOCMOV-NEXT:    retl
@@ -994,7 +994,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-CLZ:       # %bb.0:
 ; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
-; X86-CLZ-NEXT:    addl $-24, %eax
+; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    xorb $7, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-CLZ-NEXT:    retl
@@ -1003,7 +1003,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X64-CLZ:       # %bb.0:
 ; X64-CLZ-NEXT:    movzbl %dil, %eax
 ; X64-CLZ-NEXT:    lzcntl %eax, %eax
-; X64-CLZ-NEXT:    addl $-24, %eax
+; X64-CLZ-NEXT:    addb $-24, %al
 ; X64-CLZ-NEXT:    xorb $7, %al
 ; X64-CLZ-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-CLZ-NEXT:    retq
@@ -1012,7 +1012,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X64-FASTLZCNT:       # %bb.0:
 ; X64-FASTLZCNT-NEXT:    movzbl %dil, %eax
 ; X64-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X64-FASTLZCNT-NEXT:    addl $-24, %eax
+; X64-FASTLZCNT-NEXT:    addb $-24, %al
 ; X64-FASTLZCNT-NEXT:    xorb $7, %al
 ; X64-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-FASTLZCNT-NEXT:    retq
@@ -1021,7 +1021,7 @@ define i8 @ctlz_xor7_i8_false(i8 %x) {
 ; X86-FASTLZCNT:       # %bb.0:
 ; X86-FASTLZCNT-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-FASTLZCNT-NEXT:    lzcntl %eax, %eax
-; X86-FASTLZCNT-NEXT:    addl $-24, %eax
+; X86-FASTLZCNT-NEXT:    addb $-24, %al
 ; X86-FASTLZCNT-NEXT:    xorb $7, %al
 ; X86-FASTLZCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-FASTLZCNT-NEXT:    retl

>From d2059c7001fae35359b208945211e676ae6d091f Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 19:54:31 -0700
Subject: [PATCH 34/56] Update more tests

---
 llvm/test/CodeGen/X86/apx/or.ll               |  6 +-
 llvm/test/CodeGen/X86/apx/xor.ll              | 14 ++-
 .../CodeGen/X86/dag-update-nodetomatch.ll     | 30 +++---
 llvm/test/CodeGen/X86/fast-isel-fcmp.ll       |  8 +-
 llvm/test/CodeGen/X86/fptosi-sat-scalar.ll    | 16 ++--
 llvm/test/CodeGen/X86/fptoui-sat-scalar.ll    | 16 ++--
 llvm/test/CodeGen/X86/isel-and.ll             |  4 +-
 llvm/test/CodeGen/X86/isel-or.ll              |  4 +-
 llvm/test/CodeGen/X86/isel-xor.ll             |  4 +-
 llvm/test/CodeGen/X86/jump_sign.ll            |  3 +-
 llvm/test/CodeGen/X86/known-pow2.ll           |  2 +-
 llvm/test/CodeGen/X86/large-pic-string.ll     |  2 +-
 llvm/test/CodeGen/X86/logic-shift.ll          | 20 ++--
 llvm/test/CodeGen/X86/lzcnt.ll                |  6 +-
 llvm/test/CodeGen/X86/masked_store_trunc.ll   | 90 +++++++++---------
 .../CodeGen/X86/masked_store_trunc_ssat.ll    | 90 +++++++++---------
 .../CodeGen/X86/masked_store_trunc_usat.ll    | 94 +++++++++----------
 llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll |  6 +-
 llvm/test/CodeGen/X86/mixed-ptr-sizes.ll      |  4 +-
 llvm/test/CodeGen/X86/or-with-overflow.ll     |  4 +-
 llvm/test/CodeGen/X86/parity-vec.ll           | 14 +--
 llvm/test/CodeGen/X86/parity.ll               |  7 +-
 llvm/test/CodeGen/X86/pr14088.ll              |  8 +-
 llvm/test/CodeGen/X86/pr15267.ll              | 29 +++---
 llvm/test/CodeGen/X86/pr35761.ll              |  8 +-
 llvm/test/CodeGen/X86/pr40539.ll              |  2 +-
 .../CodeGen/X86/replace-load-and-with-bzhi.ll |  4 +-
 llvm/test/CodeGen/X86/sdiv_fix_sat.ll         |  8 +-
 llvm/test/CodeGen/X86/setcc-logic.ll          | 12 +--
 llvm/test/CodeGen/X86/setoeq.ll               |  4 +-
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    | 12 +--
 .../statepoint-spill-slot-size-promotion.ll   |  2 +-
 ...asked-merge-scalar-constmask-innerouter.ll | 16 ++--
 ...-scalar-constmask-interleavedbytehalves.ll | 16 ++--
 ...d-masked-merge-scalar-constmask-lowhigh.ll | 16 ++--
 ...unfold-masked-merge-vector-variablemask.ll | 87 ++++++++---------
 .../test/CodeGen/X86/vector-compare-all_of.ll |  4 +-
 .../test/CodeGen/X86/vector-compare-any_of.ll |  2 +-
 llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll | 12 +--
 llvm/test/CodeGen/X86/xor-with-overflow.ll    |  4 +-
 .../LoopStrengthReduce/X86/ivchain-X86.ll     | 42 ++++-----
 41 files changed, 353 insertions(+), 379 deletions(-)

diff --git a/llvm/test/CodeGen/X86/apx/or.ll b/llvm/test/CodeGen/X86/apx/or.ll
index e51ba9d9bf0391..593c16dea09a28 100644
--- a/llvm/test/CodeGen/X86/apx/or.ll
+++ b/llvm/test/CodeGen/X86/apx/or.ll
@@ -5,14 +5,12 @@
 define i8 @or8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: or8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    orl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x09,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    orb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x08,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: or8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} orl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x09,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} orb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x08,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %or = or i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/apx/xor.ll b/llvm/test/CodeGen/X86/apx/xor.ll
index d908849e2848f3..4b67a4af7a6423 100644
--- a/llvm/test/CodeGen/X86/apx/xor.ll
+++ b/llvm/test/CodeGen/X86/apx/xor.ll
@@ -5,14 +5,12 @@
 define i8 @xor8rr(i8 noundef %a, i8 noundef %b) {
 ; CHECK-LABEL: xor8rr:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    xorl %esi, %edi, %eax # encoding: [0x62,0xf4,0x7c,0x18,0x31,0xf7]
-; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    xorb %sil, %dil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xf7]
 ; CHECK-NEXT:    retq # encoding: [0xc3]
 ;
 ; NF-LABEL: xor8rr:
 ; NF:       # %bb.0: # %entry
-; NF-NEXT:    {nf} xorl %esi, %edi, %eax # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x31,0xf7]
-; NF-NEXT:    # kill: def $al killed $al killed $eax
+; NF-NEXT:    {nf} xorb %sil, %dil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xf7]
 ; NF-NEXT:    retq # encoding: [0xc3]
 entry:
     %xor = xor i8 %a, %b
@@ -428,8 +426,8 @@ entry:
 define i1 @xorflag8rr(i8 %a, i8 %b) {
 ; CHECK-LABEL: xorflag8rr:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe]
-; CHECK-NEXT:    xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff]
+; CHECK-NEXT:    xorb %dil, %sil, %al # encoding: [0x62,0xf4,0x7c,0x18,0x30,0xfe]
+; CHECK-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
 ; CHECK-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; CHECK-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; CHECK-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
@@ -437,8 +435,8 @@ define i1 @xorflag8rr(i8 %a, i8 %b) {
 ;
 ; NF-LABEL: xorflag8rr:
 ; NF:       # %bb.0:
-; NF-NEXT:    xorl %edi, %esi # EVEX TO LEGACY Compression encoding: [0x31,0xfe]
-; NF-NEXT:    xorb $-1, %sil, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf6,0xff]
+; NF-NEXT:    {nf} xorb %dil, %sil, %al # EVEX TO EVEX Compression encoding: [0x62,0xf4,0x7c,0x1c,0x30,0xfe]
+; NF-NEXT:    xorb $-1, %al, %cl # encoding: [0x62,0xf4,0x74,0x18,0x80,0xf0,0xff]
 ; NF-NEXT:    sete %al # encoding: [0x0f,0x94,0xc0]
 ; NF-NEXT:    movb %cl, d64(%rip) # encoding: [0x88,0x0d,A,A,A,A]
 ; NF-NEXT:    # fixup A - offset: 2, value: d64-4, kind: reloc_riprel_4byte
diff --git a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
index adb7319fe80b11..07d443d30374c7 100644
--- a/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
+++ b/llvm/test/CodeGen/X86/dag-update-nodetomatch.ll
@@ -24,25 +24,25 @@ define void @_Z1nv() local_unnamed_addr {
 ; CHECK-LABEL: _Z1nv:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movq k@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movl 4(%rax), %edx
+; CHECK-NEXT:    movl 4(%rax), %ecx
 ; CHECK-NEXT:    movq c@GOTPCREL(%rip), %rax
-; CHECK-NEXT:    movswl (%rax), %ecx
+; CHECK-NEXT:    movswl (%rax), %esi
 ; CHECK-NEXT:    movq b@GOTPCREL(%rip), %rax
 ; CHECK-NEXT:    movswl (%rax), %edi
-; CHECK-NEXT:    movq a@GOTPCREL(%rip), %rsi
-; CHECK-NEXT:    movl (%rsi), %esi
+; CHECK-NEXT:    movq a@GOTPCREL(%rip), %rdx
+; CHECK-NEXT:    movl (%rdx), %edx
 ; CHECK-NEXT:    movq l@GOTPCREL(%rip), %r8
 ; CHECK-NEXT:    movl (%r8), %r8d
 ; CHECK-NEXT:    movl %r8d, %r9d
 ; CHECK-NEXT:    shll $7, %r9d
 ; CHECK-NEXT:    sarl $7, %r9d
 ; CHECK-NEXT:    negl %r9d
-; CHECK-NEXT:    testl %esi, %esi
-; CHECK-NEXT:    cmovel %esi, %r9d
-; CHECK-NEXT:    movzwl %dx, %r10d
-; CHECK-NEXT:    leal (%rcx,%r10,2), %ecx
-; CHECK-NEXT:    addl %edi, %ecx
-; CHECK-NEXT:    cmpl %r9d, %ecx
+; CHECK-NEXT:    testl %edx, %edx
+; CHECK-NEXT:    cmovel %edx, %r9d
+; CHECK-NEXT:    movzwl %cx, %r10d
+; CHECK-NEXT:    leal (%rsi,%r10,2), %esi
+; CHECK-NEXT:    addl %edi, %esi
+; CHECK-NEXT:    cmpl %r9d, %esi
 ; CHECK-NEXT:    sete %dil
 ; CHECK-NEXT:    testl $33554431, %r8d # imm = 0x1FFFFFF
 ; CHECK-NEXT:    sete %r8b
@@ -50,12 +50,12 @@ define void @_Z1nv() local_unnamed_addr {
 ; CHECK-NEXT:    movzbl %r8b, %edi
 ; CHECK-NEXT:    movq e@GOTPCREL(%rip), %r8
 ; CHECK-NEXT:    movw %di, (%r8)
-; CHECK-NEXT:    notl %ecx
-; CHECK-NEXT:    shrl $31, %ecx
-; CHECK-NEXT:    addl %edx, %ecx
+; CHECK-NEXT:    notl %esi
+; CHECK-NEXT:    shrl $31, %esi
+; CHECK-NEXT:    addb %sil, %cl
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
-; CHECK-NEXT:    sarl %cl, %esi
-; CHECK-NEXT:    movw %si, (%rax)
+; CHECK-NEXT:    sarl %cl, %edx
+; CHECK-NEXT:    movw %dx, (%rax)
 ; CHECK-NEXT:    retq
 entry:
   %bf.load = load i32, ptr getelementptr inbounds (%struct.m, ptr @k, i64 0, i32 0, i32 1), align 4
diff --git a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
index b9ef3154cd1c3a..b56e2c0faa439a 100644
--- a/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
+++ b/llvm/test/CodeGen/X86/fast-isel-fcmp.ll
@@ -9,7 +9,7 @@ define zeroext i1 @fcmp_oeq(float %x, float %y) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    cmpeqss %xmm1, %xmm0
 ; SDAG-NEXT:    movd %xmm0, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -327,7 +327,7 @@ define zeroext i1 @fcmp_une(float %x, float %y) {
 ; SDAG:       ## %bb.0:
 ; SDAG-NEXT:    cmpneqss %xmm1, %xmm0
 ; SDAG-NEXT:    movd %xmm0, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -553,7 +553,7 @@ define zeroext i1 @fcmp_oeq3(float %x) {
 ; SDAG-NEXT:    xorps %xmm1, %xmm1
 ; SDAG-NEXT:    cmpeqss %xmm0, %xmm1
 ; SDAG-NEXT:    movd %xmm1, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
@@ -1165,7 +1165,7 @@ define zeroext i1 @fcmp_une3(float %x) {
 ; SDAG-NEXT:    xorps %xmm1, %xmm1
 ; SDAG-NEXT:    cmpneqss %xmm0, %xmm1
 ; SDAG-NEXT:    movd %xmm1, %eax
-; SDAG-NEXT:    andl $1, %eax
+; SDAG-NEXT:    andb $1, %al
 ; SDAG-NEXT:    ## kill: def $al killed $al killed $eax
 ; SDAG-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
index 04fce7badb9514..76dd49eb72c956 100644
--- a/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptosi-sat-scalar.ll
@@ -763,7 +763,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -816,7 +816,7 @@ define i100 @test_signed_i100_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -1777,7 +1777,7 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -1830,7 +1830,7 @@ define i100 @test_signed_i100_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -2957,7 +2957,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -3017,7 +3017,7 @@ define i100 @test_signed_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
@@ -4288,7 +4288,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %ebx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %eax, (%ecx)
-; X86-X87-NEXT:    andl $15, %edx
+; X86-X87-NEXT:    andb $15, %dl
 ; X86-X87-NEXT:    movb %dl, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -4349,7 +4349,7 @@ define i100 @test_signed_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %edi, 8(%esi)
 ; X86-SSE-NEXT:    movl %edx, 4(%esi)
 ; X86-SSE-NEXT:    movl %ecx, (%esi)
-; X86-SSE-NEXT:    andl $15, %eax
+; X86-SSE-NEXT:    andb $15, %al
 ; X86-SSE-NEXT:    movb %al, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $44, %esp
diff --git a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
index fefc92c313511b..8e705121f6976d 100644
--- a/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
+++ b/llvm/test/CodeGen/X86/fptoui-sat-scalar.ll
@@ -720,7 +720,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -768,7 +768,7 @@ define i100 @test_unsigned_i100_f32(float %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -1644,7 +1644,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -1692,7 +1692,7 @@ define i100 @test_unsigned_i100_f64(double %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -2723,7 +2723,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $44, %esp
@@ -2778,7 +2778,7 @@ define i100 @test_unsigned_i100_f16(half %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $32, %esp
@@ -3938,7 +3938,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-X87-NEXT:    movl %edx, 8(%ecx)
 ; X86-X87-NEXT:    movl %ebp, 4(%ecx)
 ; X86-X87-NEXT:    movl %edi, (%ecx)
-; X86-X87-NEXT:    andl $15, %eax
+; X86-X87-NEXT:    andb $15, %al
 ; X86-X87-NEXT:    movb %al, 12(%ecx)
 ; X86-X87-NEXT:    movl %ecx, %eax
 ; X86-X87-NEXT:    addl $60, %esp
@@ -3992,7 +3992,7 @@ define i100 @test_unsigned_i100_f80(x86_fp80 %f) nounwind {
 ; X86-SSE-NEXT:    movl %eax, 8(%esi)
 ; X86-SSE-NEXT:    movl %ecx, 4(%esi)
 ; X86-SSE-NEXT:    movl %edx, (%esi)
-; X86-SSE-NEXT:    andl $15, %ebx
+; X86-SSE-NEXT:    andb $15, %bl
 ; X86-SSE-NEXT:    movb %bl, 12(%esi)
 ; X86-SSE-NEXT:    movl %esi, %eax
 ; X86-SSE-NEXT:    addl $48, %esp
diff --git a/llvm/test/CodeGen/X86/isel-and.ll b/llvm/test/CodeGen/X86/isel-and.ll
index 8db8060a0b9c71..86538ccb837925 100644
--- a/llvm/test/CodeGen/X86/isel-and.ll
+++ b/llvm/test/CodeGen/X86/isel-and.ll
@@ -30,7 +30,7 @@ define i1 @and_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: and_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    andl %esi, %eax
+; SDAG-X64-NEXT:    andb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -74,7 +74,7 @@ define i8 @and_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: and_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    andl %esi, %eax
+; SDAG-X64-NEXT:    andb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/isel-or.ll b/llvm/test/CodeGen/X86/isel-or.ll
index ad11e4e5bd3b15..ffa805014e8cbc 100644
--- a/llvm/test/CodeGen/X86/isel-or.ll
+++ b/llvm/test/CodeGen/X86/isel-or.ll
@@ -30,7 +30,7 @@ define i1 @or_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: or_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    orl %esi, %eax
+; SDAG-X64-NEXT:    orb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -75,7 +75,7 @@ define i8 @or_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: or_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    orl %esi, %eax
+; SDAG-X64-NEXT:    orb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/isel-xor.ll b/llvm/test/CodeGen/X86/isel-xor.ll
index 979993cb7bdcf4..5fd3817626edc6 100644
--- a/llvm/test/CodeGen/X86/isel-xor.ll
+++ b/llvm/test/CodeGen/X86/isel-xor.ll
@@ -30,7 +30,7 @@ define i1 @xor_i1(i1 %a, i1 %b) {
 ; SDAG-X64-LABEL: xor_i1:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    xorl %esi, %eax
+; SDAG-X64-NEXT:    xorb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
@@ -75,7 +75,7 @@ define i8 @xor_i8(i8 %a, i8 %b) {
 ; SDAG-X64-LABEL: xor_i8:
 ; SDAG-X64:       # %bb.0:
 ; SDAG-X64-NEXT:    movl %edi, %eax
-; SDAG-X64-NEXT:    xorl %esi, %eax
+; SDAG-X64-NEXT:    xorb %sil, %al
 ; SDAG-X64-NEXT:    # kill: def $al killed $al killed $eax
 ; SDAG-X64-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/jump_sign.ll b/llvm/test/CodeGen/X86/jump_sign.ll
index 9eaa65442a727f..e807d806b165b5 100644
--- a/llvm/test/CodeGen/X86/jump_sign.ll
+++ b/llvm/test/CodeGen/X86/jump_sign.ll
@@ -388,11 +388,10 @@ define i32 @func_test1(i32 %p1) nounwind uwtable {
 ; CHECK-LABEL: func_test1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl b, %eax
-; CHECK-NEXT:    xorl %ecx, %ecx
 ; CHECK-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; CHECK-NEXT:    setb %cl
 ; CHECK-NEXT:    movl a, %eax
-; CHECK-NEXT:    testl %eax, %ecx
+; CHECK-NEXT:    testb %al, %cl
 ; CHECK-NEXT:    je .LBB18_2
 ; CHECK-NEXT:  # %bb.1: # %if.then
 ; CHECK-NEXT:    decl %eax
diff --git a/llvm/test/CodeGen/X86/known-pow2.ll b/llvm/test/CodeGen/X86/known-pow2.ll
index e183bbc15617d5..3220a95a6134ce 100644
--- a/llvm/test/CodeGen/X86/known-pow2.ll
+++ b/llvm/test/CodeGen/X86/known-pow2.ll
@@ -104,7 +104,7 @@ define i1 @pow2_srl(i32 %x, i32 %y) {
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shll %cl, %eax
 ; CHECK-NEXT:    shrl $20, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %yy = and i32 %y, 7
diff --git a/llvm/test/CodeGen/X86/large-pic-string.ll b/llvm/test/CodeGen/X86/large-pic-string.ll
index 5e7cdbb93dc883..2a2c46e501175f 100644
--- a/llvm/test/CodeGen/X86/large-pic-string.ll
+++ b/llvm/test/CodeGen/X86/large-pic-string.ll
@@ -12,7 +12,7 @@ define void @pr38385() {
 ; CHECK-NEXT:    movabsq $_GLOBAL_OFFSET_TABLE_-.L0$pb, %rcx
 ; CHECK-NEXT:    addq %rax, %rcx
 ; CHECK-NEXT:    movabsq $.L.str@GOTOFF, %rax
-; CHECK-NEXT:    addl %eax, %ecx
+; CHECK-NEXT:    addb %al, %cl
 ; CHECK-NEXT:    movb %cl, -{{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    retq
   %p = alloca i8, align 1
diff --git a/llvm/test/CodeGen/X86/logic-shift.ll b/llvm/test/CodeGen/X86/logic-shift.ll
index 96e63d1122ec92..7eac8534d06b20 100644
--- a/llvm/test/CodeGen/X86/logic-shift.ll
+++ b/llvm/test/CodeGen/X86/logic-shift.ll
@@ -6,7 +6,7 @@ define i8 @or_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    orb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    orb %dil, %al
@@ -178,7 +178,7 @@ define i8 @or_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    orl %esi, %edi
+; CHECK-NEXT:    orb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    orb %dil, %al
@@ -290,7 +290,7 @@ define i8 @xor_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    xorb %dil, %al
@@ -462,7 +462,7 @@ define i8 @xor_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    xorl %esi, %edi
+; CHECK-NEXT:    xorb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    xorb %dil, %al
@@ -574,7 +574,7 @@ define i8 @and_lshr_commute0(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shrb %cl, %dil
 ; CHECK-NEXT:    andb %dil, %al
@@ -746,7 +746,7 @@ define i8 @and_shl_commute1(i8 %x0, i8 %x1, i8 %y, i8 %z) {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %ecx, %eax
 ; CHECK-NEXT:    movl %edx, %ecx
-; CHECK-NEXT:    andl %esi, %edi
+; CHECK-NEXT:    andb %sil, %dil
 ; CHECK-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; CHECK-NEXT:    shlb %cl, %dil
 ; CHECK-NEXT:    andb %dil, %al
@@ -902,9 +902,8 @@ define i8 @or_fshl_commute3(i8 %x, i8 %y) {
 ; CHECK-LABEL: or_fshl_commute3:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %edi, %esi
 ; CHECK-NEXT:    shlb $5, %sil
-; CHECK-NEXT:    shrb $3, %al
+; CHECK-NEXT:    rolb $5, %al
 ; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
@@ -979,10 +978,9 @@ define i16 @or_fshr_commute2(i16 %x, i16 %y) {
 define i8 @or_fshr_commute3(i8 %x, i8 %y) {
 ; CHECK-LABEL: or_fshr_commute3:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    # kill: def $edi killed $edi def $rdi
-; CHECK-NEXT:    orl %edi, %esi
+; CHECK-NEXT:    movl %edi, %eax
 ; CHECK-NEXT:    shrb $6, %sil
-; CHECK-NEXT:    leal (,%rdi,4), %eax
+; CHECK-NEXT:    rolb $2, %al
 ; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/lzcnt.ll b/llvm/test/CodeGen/X86/lzcnt.ll
index b0004019734168..3cdb9eae354703 100644
--- a/llvm/test/CodeGen/X86/lzcnt.ll
+++ b/llvm/test/CodeGen/X86/lzcnt.ll
@@ -13,7 +13,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    lzcntl %eax, %eax
-; X86-NEXT:    addl $-24, %eax
+; X86-NEXT:    addb $-24, %al
 ; X86-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-NEXT:    retl
 ;
@@ -21,7 +21,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X32:       # %bb.0:
 ; X32-NEXT:    movzbl %dil, %eax
 ; X32-NEXT:    lzcntl %eax, %eax
-; X32-NEXT:    addl $-24, %eax
+; X32-NEXT:    addb $-24, %al
 ; X32-NEXT:    # kill: def $al killed $al killed $eax
 ; X32-NEXT:    retq
 ;
@@ -29,7 +29,7 @@ define i8 @t1(i8 %x) nounwind  {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movzbl %dil, %eax
 ; X64-NEXT:    lzcntl %eax, %eax
-; X64-NEXT:    addl $-24, %eax
+; X64-NEXT:    addb $-24, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
 	%tmp = tail call i8 @llvm.ctlz.i8( i8 %x, i1 false )
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll
index f4a0207dafde7c..77ca673d5b82ae 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll
@@ -18,7 +18,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB0_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -91,7 +91,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -223,7 +223,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -304,7 +304,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -378,7 +378,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -449,7 +449,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -612,7 +612,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -688,7 +688,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -763,7 +763,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -836,7 +836,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -988,7 +988,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1027,7 +1027,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm1[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT:    movmskps %xmm3, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1127,7 +1127,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1170,7 +1170,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm3, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1210,7 +1210,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1252,7 +1252,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1358,7 +1358,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm3, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -1401,7 +1401,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm1[0],xmm0[1],xmm1[1],xmm0[2],xmm1[2],xmm0[3],xmm1[3]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE4-NEXT:    movmskps %xmm3, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1441,7 +1441,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1483,7 +1483,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpunpcklwd {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1],xmm0[2],xmm3[2],xmm0[3],xmm3[3]
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1586,7 +1586,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1609,7 +1609,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1690,7 +1690,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1715,7 +1715,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1738,7 +1738,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -1813,7 +1813,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -1836,7 +1836,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm2
 ; SSE4-NEXT:    movmskpd %xmm2, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1858,7 +1858,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -3281,7 +3281,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -3358,7 +3358,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -3427,7 +3427,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -3494,7 +3494,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpermq {{.*#+}} ymm0 = ymm0[0,2,2,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -3658,7 +3658,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -3730,7 +3730,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -3801,7 +3801,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -3871,7 +3871,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpunpckldq {{.*#+}} xmm0 = xmm0[0],xmm3[0],xmm0[1],xmm3[1]
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4030,7 +4030,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4070,7 +4070,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4106,7 +4106,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,1,4,5,8,9,12,13,8,9,12,13,12,13,14,15]
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB13_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -4204,7 +4204,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -4244,7 +4244,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4280,7 +4280,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,4,8,12,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB14_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -6210,7 +6210,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -6276,7 +6276,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -6341,7 +6341,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
index 487f7298f442c2..97431310604edb 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll
@@ -114,7 +114,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %cond.store
@@ -221,7 +221,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -495,7 +495,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -605,7 +605,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -694,7 +694,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -773,7 +773,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1043,7 +1043,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -1147,7 +1147,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1237,7 +1237,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1317,7 +1317,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1531,7 +1531,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm0 = xmm0[0,2],xmm6[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1587,7 +1587,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm3[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1763,7 +1763,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1821,7 +1821,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm1, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1869,7 +1869,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1915,7 +1915,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2078,7 +2078,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm4, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -2137,7 +2137,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packsswb %xmm1, %xmm1
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm4
 ; SSE4-NEXT:    movmskps %xmm4, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2186,7 +2186,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -2233,7 +2233,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2368,7 +2368,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    movmskpd %xmm2, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2400,7 +2400,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2529,7 +2529,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm2 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm2
 ; SSE2-NEXT:    movmskpd %xmm2, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2563,7 +2563,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2592,7 +2592,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -2698,7 +2698,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm1, %xmm0
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm3, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -2730,7 +2730,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2758,7 +2758,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -4174,7 +4174,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4249,7 +4249,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4317,7 +4317,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4384,7 +4384,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackssdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4551,7 +4551,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -4620,7 +4620,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4689,7 +4689,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4757,7 +4757,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4922,7 +4922,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4962,7 +4962,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4998,7 +4998,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpackssdw %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB13_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -5101,7 +5101,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm0, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -5142,7 +5142,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packsswb %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -5179,7 +5179,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX-NEXT:    vpacksswb %xmm0, %xmm0, %xmm0
 ; AVX-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskps %xmm1, %eax
-; AVX-NEXT:    xorl $15, %eax
+; AVX-NEXT:    xorb $15, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB14_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -7105,7 +7105,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -7171,7 +7171,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -7236,7 +7236,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
index 498f250f11c690..b2a73a25ffc0a2 100644
--- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
+++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll
@@ -56,7 +56,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB0_2
 ; SSE2-NEXT:  # %bb.1: # %cond.store
@@ -156,7 +156,7 @@ define void @truncstore_v8i64_v8i32(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB0_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -368,7 +368,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB1_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -471,7 +471,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB1_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -560,7 +560,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB1_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -638,7 +638,7 @@ define void @truncstore_v8i64_v8i16(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpshufd {{.*#+}} xmm0 = xmm0[0,2,1,3]
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB1_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -846,7 +846,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm5, %xmm4
 ; SSE2-NEXT:    packsswb %xmm4, %xmm4
 ; SSE2-NEXT:    pmovmskb %xmm4, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm1, %ecx
 ; SSE2-NEXT:    jne .LBB2_1
@@ -943,7 +943,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm5, %xmm4
 ; SSE4-NEXT:    packsswb %xmm4, %xmm4
 ; SSE4-NEXT:    pmovmskb %xmm4, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB2_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1033,7 +1033,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm2, %xmm2
 ; AVX1-NEXT:    vinsertf128 $1, %xmm1, %ymm2, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB2_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1112,7 +1112,7 @@ define void @truncstore_v8i64_v8i8(<8 x i64> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm3, %ymm2, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB2_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1292,7 +1292,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    shufps {{.*#+}} xmm1 = xmm1[0,2],xmm5[0,2]
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB3_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1345,7 +1345,7 @@ define void @truncstore_v4i64_v4i32(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    shufps {{.*#+}} xmm5 = xmm5[0,2],xmm8[0,2]
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB3_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1490,7 +1490,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm3, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB4_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -1545,7 +1545,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm5, %xmm5
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB4_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1595,7 +1595,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB4_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1641,7 +1641,7 @@ define void @truncstore_v4i64_v4i16(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB4_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -1771,7 +1771,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm1, %xmm1
 ; SSE2-NEXT:    pcmpeqd %xmm2, %xmm3
 ; SSE2-NEXT:    movmskps %xmm3, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    jne .LBB5_1
@@ -1827,7 +1827,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packuswb %xmm5, %xmm5
 ; SSE4-NEXT:    pcmpeqd %xmm2, %xmm6
 ; SSE4-NEXT:    movmskps %xmm6, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB5_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -1878,7 +1878,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB5_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -1925,7 +1925,7 @@ define void @truncstore_v4i64_v4i8(<4 x i64> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB5_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -2042,7 +2042,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB6_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2070,7 +2070,7 @@ define void @truncstore_v2i64_v2i32(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[0,2,2,3]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB6_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2173,7 +2173,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB7_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -2203,7 +2203,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB7_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2229,7 +2229,7 @@ define void @truncstore_v2i64_v2i16(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshuflw {{.*#+}} xmm0 = xmm0[0,2,2,3,4,5,6,7]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB7_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -2316,7 +2316,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm2[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm2, %xmm0
 ; SSE2-NEXT:    movmskpd %xmm0, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm3, %ecx
 ; SSE2-NEXT:    jne .LBB8_1
@@ -2344,7 +2344,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; SSE4-NEXT:    pshufb {{.*#+}} xmm2 = xmm2[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; SSE4-NEXT:    pcmpeqq %xmm1, %xmm3
 ; SSE4-NEXT:    movmskpd %xmm3, %eax
-; SSE4-NEXT:    xorl $3, %eax
+; SSE4-NEXT:    xorb $3, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB8_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -2369,7 +2369,7 @@ define void @truncstore_v2i64_v2i8(<2 x i64> %x, ptr %p, <2 x i64> %mask) {
 ; AVX-NEXT:    vpshufb {{.*#+}} xmm0 = xmm0[0,8,u,u,u,u,u,u,u,u,u,u,u,u,u,u]
 ; AVX-NEXT:    vpcmpeqq %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vmovmskpd %xmm1, %eax
-; AVX-NEXT:    xorl $3, %eax
+; AVX-NEXT:    xorb $3, %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB8_1
 ; AVX-NEXT:  # %bb.2: # %else
@@ -3868,7 +3868,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB11_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -3946,7 +3946,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB11_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4017,7 +4017,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB11_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4086,7 +4086,7 @@ define void @truncstore_v8i32_v8i16(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm3, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB11_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4265,7 +4265,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE2-NEXT:    packssdw %xmm3, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm4, %ecx
 ; SSE2-NEXT:    jne .LBB12_1
@@ -4337,7 +4337,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; SSE4-NEXT:    packssdw %xmm3, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB12_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4409,7 +4409,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX1-NEXT:    vpcmpeqd %xmm3, %xmm1, %xmm1
 ; AVX1-NEXT:    vinsertf128 $1, %xmm2, %ymm1, %ymm1
 ; AVX1-NEXT:    vmovmskps %ymm1, %eax
-; AVX1-NEXT:    notl %eax
+; AVX1-NEXT:    notb %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB12_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4479,7 +4479,7 @@ define void @truncstore_v8i32_v8i8(<8 x i32> %x, ptr %p, <8 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %ymm2, %ymm1, %ymm1
 ; AVX2-NEXT:    vmovmskps %ymm1, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB12_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4648,7 +4648,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm0[0,2,2,3]
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %eax
-; SSE2-NEXT:    xorl $15, %eax
+; SSE2-NEXT:    xorb $15, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    jne .LBB13_1
 ; SSE2-NEXT:  # %bb.2: # %else
@@ -4689,7 +4689,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packusdw %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB13_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4726,7 +4726,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB13_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4764,7 +4764,7 @@ define void @truncstore_v4i32_v4i16(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackusdw %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB13_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -4873,7 +4873,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE2-NEXT:    packuswb %xmm3, %xmm3
 ; SSE2-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE2-NEXT:    movmskps %xmm2, %ecx
-; SSE2-NEXT:    xorl $15, %ecx
+; SSE2-NEXT:    xorb $15, %cl
 ; SSE2-NEXT:    testb $1, %cl
 ; SSE2-NEXT:    movd %xmm3, %eax
 ; SSE2-NEXT:    jne .LBB14_1
@@ -4915,7 +4915,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; SSE4-NEXT:    packuswb %xmm0, %xmm0
 ; SSE4-NEXT:    pcmpeqd %xmm1, %xmm2
 ; SSE4-NEXT:    movmskps %xmm2, %eax
-; SSE4-NEXT:    xorl $15, %eax
+; SSE4-NEXT:    xorb $15, %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB14_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -4953,7 +4953,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX1-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX1-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX1-NEXT:    vmovmskps %xmm1, %eax
-; AVX1-NEXT:    xorl $15, %eax
+; AVX1-NEXT:    xorb $15, %al
 ; AVX1-NEXT:    testb $1, %al
 ; AVX1-NEXT:    jne .LBB14_1
 ; AVX1-NEXT:  # %bb.2: # %else
@@ -4992,7 +4992,7 @@ define void @truncstore_v4i32_v4i8(<4 x i32> %x, ptr %p, <4 x i32> %mask) {
 ; AVX2-NEXT:    vpackuswb %xmm0, %xmm0, %xmm0
 ; AVX2-NEXT:    vpcmpeqd %xmm2, %xmm1, %xmm1
 ; AVX2-NEXT:    vmovmskps %xmm1, %eax
-; AVX2-NEXT:    xorl $15, %eax
+; AVX2-NEXT:    xorb $15, %al
 ; AVX2-NEXT:    testb $1, %al
 ; AVX2-NEXT:    jne .LBB14_1
 ; AVX2-NEXT:  # %bb.2: # %else
@@ -6953,7 +6953,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE2-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE2-NEXT:    packsswb %xmm2, %xmm2
 ; SSE2-NEXT:    pmovmskb %xmm2, %eax
-; SSE2-NEXT:    notl %eax
+; SSE2-NEXT:    notb %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    movd %xmm0, %ecx
 ; SSE2-NEXT:    jne .LBB17_1
@@ -7020,7 +7020,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; SSE4-NEXT:    pcmpeqw %xmm1, %xmm2
 ; SSE4-NEXT:    packsswb %xmm2, %xmm2
 ; SSE4-NEXT:    pmovmskb %xmm2, %eax
-; SSE4-NEXT:    notl %eax
+; SSE4-NEXT:    notb %al
 ; SSE4-NEXT:    testb $1, %al
 ; SSE4-NEXT:    jne .LBB17_1
 ; SSE4-NEXT:  # %bb.2: # %else
@@ -7086,7 +7086,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) {
 ; AVX-NEXT:    vpcmpeqw %xmm2, %xmm1, %xmm1
 ; AVX-NEXT:    vpacksswb %xmm1, %xmm1, %xmm1
 ; AVX-NEXT:    vpmovmskb %xmm1, %eax
-; AVX-NEXT:    notl %eax
+; AVX-NEXT:    notb %al
 ; AVX-NEXT:    testb $1, %al
 ; AVX-NEXT:    jne .LBB17_1
 ; AVX-NEXT:  # %bb.2: # %else
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
index c997d314a50ae4..eed2b2fa01ad32 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes-i686.ll
@@ -359,8 +359,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; CHECK-NEXT:    andl $1, %ecx
+; CHECK-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; CHECK-NEXT:    andb $1, %cl
 ; CHECK-NEXT:    movb %cl, (%eax)
 ; CHECK-NEXT:    retl
 ;
@@ -368,8 +368,8 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; CHECK-O0-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; CHECK-O0-NEXT:    andl $1, %ecx
 ; CHECK-O0-NEXT:    # kill: def $cl killed $cl killed $ecx
+; CHECK-O0-NEXT:    andb $1, %cl
 ; CHECK-O0-NEXT:    movb %cl, (%eax)
 ; CHECK-O0-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
index 67539b07f5716c..dcbfde8285e5d9 100644
--- a/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
+++ b/llvm/test/CodeGen/X86/mixed-ptr-sizes.ll
@@ -281,15 +281,15 @@ define void @test_store_sptr32_trunc_i1(ptr addrspace(270) %s, i32 %i) {
 ; CHECK-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK:       # %bb.0: # %entry
 ; CHECK-NEXT:    movslq %ecx, %rax
-; CHECK-NEXT:    andl $1, %edx
+; CHECK-NEXT:    andb $1, %dl
 ; CHECK-NEXT:    movb %dl, (%rax)
 ; CHECK-NEXT:    retq
 ;
 ; CHECK-O0-LABEL: test_store_sptr32_trunc_i1:
 ; CHECK-O0:       # %bb.0: # %entry
 ; CHECK-O0-NEXT:    movslq %ecx, %rax
-; CHECK-O0-NEXT:    andl $1, %edx
 ; CHECK-O0-NEXT:    movb %dl, %cl
+; CHECK-O0-NEXT:    andb $1, %cl
 ; CHECK-O0-NEXT:    movb %cl, (%rax)
 ; CHECK-O0-NEXT:    retq
 entry:
diff --git a/llvm/test/CodeGen/X86/or-with-overflow.ll b/llvm/test/CodeGen/X86/or-with-overflow.ll
index b3ffa209bc7004..c7b528a88ca0c2 100644
--- a/llvm/test/CodeGen/X86/or-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/or-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @or_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: or_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    orl %edi, %eax
+; X64-NEXT:    orb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/parity-vec.ll b/llvm/test/CodeGen/X86/parity-vec.ll
index f9a2411465141c..cb525a680b1a7f 100644
--- a/llvm/test/CodeGen/X86/parity-vec.ll
+++ b/llvm/test/CodeGen/X86/parity-vec.ll
@@ -16,7 +16,7 @@ define i1 @noncanonical_parity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %r = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
@@ -36,7 +36,7 @@ define i1 @canonical_parity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %i1 = bitcast <16 x i1> %x to i16
@@ -65,7 +65,7 @@ define i1 @canonical_parity_noncanonical_pred(<16 x i1> %x) {
 ; NOPOPCNT-NEXT:    andl $3855, %ecx # imm = 0xF0F
 ; NOPOPCNT-NEXT:    movl %ecx, %eax
 ; NOPOPCNT-NEXT:    shrl $8, %eax
-; NOPOPCNT-NEXT:    addl %ecx, %eax
+; NOPOPCNT-NEXT:    addb %cl, %al
 ; NOPOPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; NOPOPCNT-NEXT:    retq
 ;
@@ -97,8 +97,8 @@ define i1 @noncanonical_nonparity(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
-; POPCNT-NEXT:    xorb $1, %al
+; POPCNT-NEXT:    notb %al
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %r.inv = call i1 @llvm.vector.reduce.xor.v16i1(<16 x i1> %x)
@@ -142,8 +142,8 @@ define i1 @canonical_nonparity_noncanonical_pred(<16 x i1> %x) {
 ; POPCNT-NEXT:    psllw $7, %xmm0
 ; POPCNT-NEXT:    pmovmskb %xmm0, %eax
 ; POPCNT-NEXT:    popcntl %eax, %eax
-; POPCNT-NEXT:    andl $1, %eax
-; POPCNT-NEXT:    xorb $1, %al
+; POPCNT-NEXT:    notb %al
+; POPCNT-NEXT:    andb $1, %al
 ; POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; POPCNT-NEXT:    retq
   %i1 = bitcast <16 x i1> %x to i16
diff --git a/llvm/test/CodeGen/X86/parity.ll b/llvm/test/CodeGen/X86/parity.ll
index 420f5ba5ab4336..2c3199db6ec65b 100644
--- a/llvm/test/CodeGen/X86/parity.ll
+++ b/llvm/test/CodeGen/X86/parity.ll
@@ -318,14 +318,14 @@ define i8 @parity_32_trunc(i32 %x) {
 ; X86-POPCNT-LABEL: parity_32_trunc:
 ; X86-POPCNT:       # %bb.0:
 ; X86-POPCNT-NEXT:    popcntl {{[0-9]+}}(%esp), %eax
-; X86-POPCNT-NEXT:    andl $1, %eax
+; X86-POPCNT-NEXT:    andb $1, %al
 ; X86-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X86-POPCNT-NEXT:    retl
 ;
 ; X64-POPCNT-LABEL: parity_32_trunc:
 ; X64-POPCNT:       # %bb.0:
 ; X64-POPCNT-NEXT:    popcntl %edi, %eax
-; X64-POPCNT-NEXT:    andl $1, %eax
+; X64-POPCNT-NEXT:    andb $1, %al
 ; X64-POPCNT-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-POPCNT-NEXT:    retq
   %1 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -381,9 +381,8 @@ define i16 @parity_16_mask255(i16 %x) {
 define i16 @parity_16_mask15(i16 %x) {
 ; X86-LABEL: parity_16_mask15:
 ; X86:       # %bb.0:
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %eax, %eax
-; X86-NEXT:    testb $15, %cl
+; X86-NEXT:    testb $15, {{[0-9]+}}(%esp)
 ; X86-NEXT:    setnp %al
 ; X86-NEXT:    # kill: def $ax killed $ax killed $eax
 ; X86-NEXT:    retl
diff --git a/llvm/test/CodeGen/X86/pr14088.ll b/llvm/test/CodeGen/X86/pr14088.ll
index 83bf13280f94ae..32a5155c41b907 100644
--- a/llvm/test/CodeGen/X86/pr14088.ll
+++ b/llvm/test/CodeGen/X86/pr14088.ll
@@ -29,10 +29,10 @@ define i32 @f(i1 %foo, ptr %tm_year2, ptr %bar, i16 %zed, i32 %zed2) {
 ; CHECK-NEXT:    movswq %ax, %rax
 ; CHECK-NEXT:    imulq $1717986919, %rax, %rax # imm = 0x66666667
 ; CHECK-NEXT:    movq %rax, %rcx
-; CHECK-NEXT:    shrq $63, %rcx
-; CHECK-NEXT:    shrq $34, %rax
-; CHECK-NEXT:    addl %ecx, %eax
-; CHECK-NEXT:    movb %al, (%rdx)
+; CHECK-NEXT:    shrq $34, %rcx
+; CHECK-NEXT:    shrq $63, %rax
+; CHECK-NEXT:    addb %al, %cl
+; CHECK-NEXT:    movb %cl, (%rdx)
 ; CHECK-NEXT:    xorl %eax, %eax
 ; CHECK-NEXT:  .LBB0_2: # %return
 ; CHECK-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/pr15267.ll b/llvm/test/CodeGen/X86/pr15267.ll
index 5083eac71dce0c..89ef4cb2201e4b 100644
--- a/llvm/test/CodeGen/X86/pr15267.ll
+++ b/llvm/test/CodeGen/X86/pr15267.ll
@@ -84,62 +84,61 @@ define <16 x i4> @test4(ptr %in) nounwind {
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movq (%rdi), %rax
 ; CHECK-NEXT:    movl %eax, %ecx
-; CHECK-NEXT:    shrl $4, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    shrb $4, %cl
 ; CHECK-NEXT:    movl %eax, %edx
-; CHECK-NEXT:    andl $15, %edx
+; CHECK-NEXT:    andb $15, %dl
 ; CHECK-NEXT:    vmovd %edx, %xmm0
 ; CHECK-NEXT:    vpinsrb $1, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $8, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $2, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $12, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $3, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $16, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $4, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $20, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $5, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $24, %ecx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $6, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movl %eax, %ecx
 ; CHECK-NEXT:    shrl $28, %ecx
 ; CHECK-NEXT:    vpinsrb $7, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $32, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $8, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $36, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $9, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $40, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $10, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $44, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $11, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $48, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $12, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $52, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $13, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    movq %rax, %rcx
 ; CHECK-NEXT:    shrq $56, %rcx
-; CHECK-NEXT:    andl $15, %ecx
+; CHECK-NEXT:    andb $15, %cl
 ; CHECK-NEXT:    vpinsrb $14, %ecx, %xmm0, %xmm0
 ; CHECK-NEXT:    shrq $60, %rax
 ; CHECK-NEXT:    vpinsrb $15, %eax, %xmm0, %xmm0
diff --git a/llvm/test/CodeGen/X86/pr35761.ll b/llvm/test/CodeGen/X86/pr35761.ll
index 5661b6775ab9d9..804366d93203b8 100644
--- a/llvm/test/CodeGen/X86/pr35761.ll
+++ b/llvm/test/CodeGen/X86/pr35761.ll
@@ -8,10 +8,10 @@
 define dso_local void @PR35761(i32 %call) {
 ; CHECK-LABEL: PR35761:
 ; CHECK:       # %bb.0: # %entry
-; CHECK-NEXT:    movzbl x(%rip), %eax
-; CHECK-NEXT:    andl $1, %eax
-; CHECK-NEXT:    movzbl y(%rip), %ecx
-; CHECK-NEXT:    xorl $255, %ecx
+; CHECK-NEXT:    movzbl y(%rip), %eax
+; CHECK-NEXT:    xorl $255, %eax
+; CHECK-NEXT:    movzbl x(%rip), %ecx
+; CHECK-NEXT:    andl $1, %ecx
 ; CHECK-NEXT:    orl %eax, %ecx
 ; CHECK-NEXT:    movw %cx, z(%rip)
 ; CHECK-NEXT:    movb $0, z+2(%rip)
diff --git a/llvm/test/CodeGen/X86/pr40539.ll b/llvm/test/CodeGen/X86/pr40539.ll
index 56d80a025fa084..a1df2d8f69238e 100644
--- a/llvm/test/CodeGen/X86/pr40539.ll
+++ b/llvm/test/CodeGen/X86/pr40539.ll
@@ -18,7 +18,7 @@ define zeroext i1 @_Z9test_log2v() {
 ; CHECK-NEXT:    xorps %xmm0, %xmm0
 ; CHECK-NEXT:    cmpeqss (%esp), %xmm0
 ; CHECK-NEXT:    movd %xmm0, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    popl %ecx
 ; CHECK-NEXT:    .cfi_def_cfa_offset 4
diff --git a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
index 65e679063ccb09..6edd9a36b6185e 100644
--- a/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
+++ b/llvm/test/CodeGen/X86/replace-load-and-with-bzhi.ll
@@ -15,7 +15,7 @@ define i32 @f32_bzhi(i32 %x, i32 %y) local_unnamed_addr {
 ;
 ; CHECK32-LABEL: f32_bzhi:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    retl
 entry:
@@ -34,7 +34,7 @@ define i32 @f32_bzhi_partial(i32 %x, i32 %y) local_unnamed_addr {
 ;
 ; CHECK32-LABEL: f32_bzhi_partial:
 ; CHECK32:       # %bb.0: # %entry
-; CHECK32-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; CHECK32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    bzhil %eax, {{[0-9]+}}(%esp), %eax
 ; CHECK32-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
index e7727a0ab6178c..f8a0698208ebda 100644
--- a/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
+++ b/llvm/test/CodeGen/X86/sdiv_fix_sat.ll
@@ -597,7 +597,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -648,7 +648,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -710,7 +710,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
@@ -760,7 +760,7 @@ define <4 x i32> @vec(<4 x i32> %x, <4 x i32> %y) nounwind {
 ; X64-NEXT:    subq $1, %rbp
 ; X64-NEXT:    sbbq $0, %r14
 ; X64-NEXT:    shrq $63, %rbx
-; X64-NEXT:    xorl %r15d, %ebx
+; X64-NEXT:    xorb %r15b, %bl
 ; X64-NEXT:    movq %r12, %rdi
 ; X64-NEXT:    movq %r13, %rsi
 ; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
diff --git a/llvm/test/CodeGen/X86/setcc-logic.ll b/llvm/test/CodeGen/X86/setcc-logic.ll
index c98aae7fbf4059..3d9105922e3165 100644
--- a/llvm/test/CodeGen/X86/setcc-logic.ll
+++ b/llvm/test/CodeGen/X86/setcc-logic.ll
@@ -454,9 +454,9 @@ define zeroext i1 @ne_neg1_and_ne_zero(i64 %x) nounwind {
 define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 ; CHECK-LABEL: and_eq:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl %ecx, %edx
-; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    xorb %sil, %dil
+; CHECK-NEXT:    xorb %cl, %dl
+; CHECK-NEXT:    orb %dil, %dl
 ; CHECK-NEXT:    sete %al
 ; CHECK-NEXT:    retq
   %cmp1 = icmp eq i8 %a, %b
@@ -468,9 +468,9 @@ define zeroext i1 @and_eq(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 define zeroext i1 @or_ne(i8 %a, i8 %b, i8 %c, i8 %d) nounwind {
 ; CHECK-LABEL: or_ne:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    xorl %esi, %edi
-; CHECK-NEXT:    xorl %ecx, %edx
-; CHECK-NEXT:    orb %dl, %dil
+; CHECK-NEXT:    xorb %sil, %dil
+; CHECK-NEXT:    xorb %cl, %dl
+; CHECK-NEXT:    orb %dil, %dl
 ; CHECK-NEXT:    setne %al
 ; CHECK-NEXT:    retq
   %cmp1 = icmp ne i8 %a, %b
diff --git a/llvm/test/CodeGen/X86/setoeq.ll b/llvm/test/CodeGen/X86/setoeq.ll
index f0addf4b64599d..d30e9395af0458 100644
--- a/llvm/test/CodeGen/X86/setoeq.ll
+++ b/llvm/test/CodeGen/X86/setoeq.ll
@@ -9,7 +9,7 @@ define zeroext i8 @t(double %x) nounwind readnone {
 ; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; CHECK-NEXT:    cmpeqsd %xmm0, %xmm1
 ; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
 entry:
@@ -28,7 +28,7 @@ define zeroext i8 @u(double %x) nounwind readnone {
 ; CHECK-NEXT:    cvtdq2pd %xmm1, %xmm1
 ; CHECK-NEXT:    cmpneqsd %xmm0, %xmm1
 ; CHECK-NEXT:    movd %xmm1, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
 entry:
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 220c2e5012ea49..6fe45b37f030e8 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2476,14 +2476,14 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
+; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
+; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm3, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
-; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
-; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm0
-; CHECK-AVX512VL-NEXT:    vpternlogq $14, %ymm3, %ymm2, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm0
+; CHECK-AVX512VL-NEXT:    vpternlogq $50, %ymm2, %ymm1, %ymm0
 ; CHECK-AVX512VL-NEXT:    retq
   %rem = srem <32 x i8> %x, <i8 13, i8 5, i8 19, i8 34, i8 2, i8 8, i8 2, i8 88, i8 62, i8 62, i8 5, i8 7, i8 97, i8 2, i8 3, i8 60, i8 3, i8 87, i8 7, i8 6, i8 84, i8 -128, i8 127, i8 56, i8 114, i8 1, i8 50, i8 7, i8 2, i8 8, i8 97, i8 117>
   %cmp = icmp ne <32 x i8> %rem, zeroinitializer
diff --git a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
index f7e053d384c99d..f624122be8374e 100644
--- a/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
+++ b/llvm/test/CodeGen/X86/statepoint-spill-slot-size-promotion.ll
@@ -27,7 +27,7 @@ define i1 @test_spill_slot_size(i1 %a1, i2 %a2, i7 %a7, i8 %a8, i9 %a9, i15 %a15
 ; CHECK-NEXT:    andb $3, %sil
 ; CHECK-NEXT:    movb %sil, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    movl %ebx, %eax
-; CHECK-NEXT:    andl $1, %eax
+; CHECK-NEXT:    andb $1, %al
 ; CHECK-NEXT:    movb %al, {{[0-9]+}}(%rsp)
 ; CHECK-NEXT:    andb $127, %dl
 ; CHECK-NEXT:    movb %dl, {{[0-9]+}}(%rsp)
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
index 9a8719f9a64fa5..cecc3efd86282f 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-innerouter.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $60, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $60, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $60, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $60, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
index 2ea74f39423872..074b6f8591d1e1 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-interleavedbytehalves.ll
@@ -114,19 +114,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $15, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $15, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
index eb6accd3e623b0..a9cf67789288c2 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll
@@ -104,19 +104,19 @@ define i64 @out64_constmask(i64 %x, i64 %y) {
 define i8 @in8_constmask(i8 %x, i8 %y) {
 ; CHECK-NOBMI-LABEL: in8_constmask:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %esi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %edi
-; CHECK-NOBMI-NEXT:    andb $15, %dil
-; CHECK-NOBMI-NEXT:    xorb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb $15, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8_constmask:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    movl %esi, %eax
-; CHECK-BMI-NEXT:    xorl %esi, %edi
-; CHECK-BMI-NEXT:    andb $15, %dil
-; CHECK-BMI-NEXT:    xorb %dil, %al
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb $15, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
index b1194bedc4e1ca..27fa5e68f13052 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-vector-variablemask.ll
@@ -16,11 +16,10 @@
 define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: out_v1i8:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    movl %edx, %eax
-; CHECK-NEXT:    andl %edx, %edi
-; CHECK-NEXT:    notb %al
-; CHECK-NEXT:    andb %sil, %al
-; CHECK-NEXT:    orb %dil, %al
+; CHECK-NEXT:    movl %edi, %eax
+; CHECK-NEXT:    xorb %sil, %al
+; CHECK-NEXT:    andb %dl, %al
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %mx = and <1 x i8> %x, %mask
@@ -37,32 +36,28 @@ define <1 x i8> @out_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 define <2 x i8> @out_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: out_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
-; CHECK-BASELINE-NEXT:    movl %r8d, %eax
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %edi
-; CHECK-BASELINE-NEXT:    notb %al
-; CHECK-BASELINE-NEXT:    notb %r9b
-; CHECK-BASELINE-NEXT:    andb %cl, %r9b
-; CHECK-BASELINE-NEXT:    andb %dl, %al
-; CHECK-BASELINE-NEXT:    orb %dil, %al
-; CHECK-BASELINE-NEXT:    orb %sil, %r9b
+; CHECK-BASELINE-NEXT:    movl %edi, %eax
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
+; CHECK-BASELINE-NEXT:    andb %r9b, %sil
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-BASELINE-NEXT:    movl %r9d, %edx
+; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
 ;
 ; CHECK-SSE1-LABEL: out_v2i8:
 ; CHECK-SSE1:       # %bb.0:
-; CHECK-SSE1-NEXT:    movl %r8d, %eax
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %edi
-; CHECK-SSE1-NEXT:    notb %al
-; CHECK-SSE1-NEXT:    notb %r9b
-; CHECK-SSE1-NEXT:    andb %cl, %r9b
-; CHECK-SSE1-NEXT:    andb %dl, %al
-; CHECK-SSE1-NEXT:    orb %dil, %al
-; CHECK-SSE1-NEXT:    orb %sil, %r9b
+; CHECK-SSE1-NEXT:    movl %edi, %eax
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
+; CHECK-SSE1-NEXT:    andb %r9b, %sil
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
-; CHECK-SSE1-NEXT:    movl %r9d, %edx
+; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
 ;
 ; CHECK-SSE2-LABEL: out_v2i8:
@@ -111,7 +106,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
@@ -135,7 +130,7 @@ define <4 x i8> @out_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r11b, %dl
@@ -178,7 +173,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r10b, %dl
@@ -199,7 +194,7 @@ define <4 x i8> @out_v4i8_undef(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %cl
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    andb {{[0-9]+}}(%rsp), %sil
 ; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r10b, %dl
@@ -2277,9 +2272,9 @@ define <1 x i8> @in_v1i8(<1 x i8> %x, <1 x i8> %y, <1 x i8> %mask) nounwind {
 ; CHECK-LABEL: in_v1i8:
 ; CHECK:       # %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
-; CHECK-NEXT:    andl %edx, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
+; CHECK-NEXT:    andb %dl, %al
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
   %n0 = xor <1 x i8> %x, %y
@@ -2296,12 +2291,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-BASELINE-LABEL: in_v2i8:
 ; CHECK-BASELINE:       # %bb.0:
 ; CHECK-BASELINE-NEXT:    movl %edi, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
-; CHECK-BASELINE-NEXT:    andl %r9d, %esi
-; CHECK-BASELINE-NEXT:    andl %r8d, %eax
-; CHECK-BASELINE-NEXT:    xorl %edx, %eax
-; CHECK-BASELINE-NEXT:    xorl %ecx, %esi
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
+; CHECK-BASELINE-NEXT:    andb %r9b, %sil
+; CHECK-BASELINE-NEXT:    andb %r8b, %al
+; CHECK-BASELINE-NEXT:    xorb %dl, %al
+; CHECK-BASELINE-NEXT:    xorb %cl, %sil
 ; CHECK-BASELINE-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BASELINE-NEXT:    movl %esi, %edx
 ; CHECK-BASELINE-NEXT:    retq
@@ -2309,12 +2304,12 @@ define <2 x i8> @in_v2i8(<2 x i8> %x, <2 x i8> %y, <2 x i8> %mask) nounwind {
 ; CHECK-SSE1-LABEL: in_v2i8:
 ; CHECK-SSE1:       # %bb.0:
 ; CHECK-SSE1-NEXT:    movl %edi, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
-; CHECK-SSE1-NEXT:    andl %r9d, %esi
-; CHECK-SSE1-NEXT:    andl %r8d, %eax
-; CHECK-SSE1-NEXT:    xorl %edx, %eax
-; CHECK-SSE1-NEXT:    xorl %ecx, %esi
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
+; CHECK-SSE1-NEXT:    andb %r9b, %sil
+; CHECK-SSE1-NEXT:    andb %r8b, %al
+; CHECK-SSE1-NEXT:    xorb %dl, %al
+; CHECK-SSE1-NEXT:    xorb %cl, %sil
 ; CHECK-SSE1-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-SSE1-NEXT:    movl %esi, %edx
 ; CHECK-SSE1-NEXT:    retq
@@ -2362,7 +2357,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-BASELINE-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-BASELINE-NEXT:    xorl %r9d, %esi
+; CHECK-BASELINE-NEXT:    xorb %r9b, %sil
 ; CHECK-BASELINE-NEXT:    xorb %r11b, %dl
 ; CHECK-BASELINE-NEXT:    xorb %r10b, %cl
 ; CHECK-BASELINE-NEXT:    xorb %dil, %r8b
@@ -2386,7 +2381,7 @@ define <4 x i8> @in_v4i8(<4 x i8> %x, <4 x i8> %y, <4 x i8> %mask) nounwind {
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %edi
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r10d
 ; CHECK-SSE1-NEXT:    movzbl {{[0-9]+}}(%rsp), %r11d
-; CHECK-SSE1-NEXT:    xorl %r9d, %esi
+; CHECK-SSE1-NEXT:    xorb %r9b, %sil
 ; CHECK-SSE1-NEXT:    xorb %r11b, %dl
 ; CHECK-SSE1-NEXT:    xorb %r10b, %cl
 ; CHECK-SSE1-NEXT:    xorb %dil, %r8b
diff --git a/llvm/test/CodeGen/X86/vector-compare-all_of.ll b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
index 30202701fdb8c8..f70f586d0ecdb1 100644
--- a/llvm/test/CodeGen/X86/vector-compare-all_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-all_of.ll
@@ -1355,10 +1355,8 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; SSE2-NEXT:    pxor %xmm4, %xmm0
 ; SSE2-NEXT:    pcmpgtd %xmm2, %xmm0
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
-; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    notl %eax
-; SSE2-NEXT:    cmpb $-1, %al
+; SSE2-NEXT:    testl %eax, %eax
 ; SSE2-NEXT:    sete %al
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-compare-any_of.ll b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
index 2df39d69dbb751..8f441fee670841 100644
--- a/llvm/test/CodeGen/X86/vector-compare-any_of.ll
+++ b/llvm/test/CodeGen/X86/vector-compare-any_of.ll
@@ -1195,7 +1195,7 @@ define i1 @bool_reduction_v8i32(<8 x i32> %x, <8 x i32> %y) {
 ; SSE2-NEXT:    packssdw %xmm1, %xmm0
 ; SSE2-NEXT:    packsswb %xmm0, %xmm0
 ; SSE2-NEXT:    pmovmskb %xmm0, %eax
-; SSE2-NEXT:    xorb $-1, %al
+; SSE2-NEXT:    cmpb $-1, %al
 ; SSE2-NEXT:    setne %al
 ; SSE2-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
index 9cd0f4d12e15ab..4d0d0548d44f90 100644
--- a/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
+++ b/llvm/test/CodeGen/X86/vector-reduce-or-cmp.ll
@@ -1068,10 +1068,10 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE2-NEXT:    movd %xmm1, %eax
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[1,1,1,1]
 ; SSE2-NEXT:    movd %xmm0, %ecx
-; SSE2-NEXT:    orl %eax, %ecx
+; SSE2-NEXT:    orb %al, %cl
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm0 = xmm1[2,3,2,3]
 ; SSE2-NEXT:    movd %xmm0, %eax
-; SSE2-NEXT:    orl %ecx, %eax
+; SSE2-NEXT:    orb %cl, %al
 ; SSE2-NEXT:    testb $1, %al
 ; SSE2-NEXT:    je .LBB27_2
 ; SSE2-NEXT:  # %bb.1:
@@ -1088,9 +1088,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; SSE41-NEXT:    pxor %xmm0, %xmm1
 ; SSE41-NEXT:    pextrd $1, %xmm1, %eax
 ; SSE41-NEXT:    movd %xmm1, %ecx
-; SSE41-NEXT:    orl %eax, %ecx
+; SSE41-NEXT:    orb %al, %cl
 ; SSE41-NEXT:    pextrd $2, %xmm1, %eax
-; SSE41-NEXT:    orl %ecx, %eax
+; SSE41-NEXT:    orb %cl, %al
 ; SSE41-NEXT:    testb $1, %al
 ; SSE41-NEXT:    je .LBB27_2
 ; SSE41-NEXT:  # %bb.1:
@@ -1107,9 +1107,9 @@ define i32 @mask_v3i1(<3 x i32> %a, <3 x i32> %b) {
 ; AVX1OR2-NEXT:    vpxor %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vpextrd $1, %xmm0, %eax
 ; AVX1OR2-NEXT:    vmovd %xmm0, %ecx
-; AVX1OR2-NEXT:    orl %eax, %ecx
+; AVX1OR2-NEXT:    orb %al, %cl
 ; AVX1OR2-NEXT:    vpextrd $2, %xmm0, %eax
-; AVX1OR2-NEXT:    orl %ecx, %eax
+; AVX1OR2-NEXT:    orb %cl, %al
 ; AVX1OR2-NEXT:    testb $1, %al
 ; AVX1OR2-NEXT:    je .LBB27_2
 ; AVX1OR2-NEXT:  # %bb.1:
diff --git a/llvm/test/CodeGen/X86/xor-with-overflow.ll b/llvm/test/CodeGen/X86/xor-with-overflow.ll
index 5d22302d39add9..e025989156e9e3 100644
--- a/llvm/test/CodeGen/X86/xor-with-overflow.ll
+++ b/llvm/test/CodeGen/X86/xor-with-overflow.ll
@@ -46,8 +46,8 @@ define i8 @xor_i8_rr(i8 zeroext %0, i8 zeroext %1) {
 ;
 ; X64-LABEL: xor_i8_rr:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %esi, %eax
-; X64-NEXT:    xorl %edi, %eax
+; X64-NEXT:    xorb %dil, %sil
+; X64-NEXT:    movzbl %sil, %eax
 ; X64-NEXT:    cmovel %edi, %eax
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
index 39e2d6f1acca45..9170d8f7b45de9 100644
--- a/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/ivchain-X86.ll
@@ -312,22 +312,18 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X64-NEXT:    .p2align 4, 0x90
 ; X64-NEXT:  .LBB3_1: # %for.body
 ; X64-NEXT:    # =>This Inner Loop Header: Depth=1
-; X64-NEXT:    movzbl -3(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -3(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -3(%rdx,%rax)
-; X64-NEXT:    movzbl -2(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -2(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -2(%rdx,%rax)
-; X64-NEXT:    movzbl -1(%rdi,%rax), %ecx
-; X64-NEXT:    movzbl -1(%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, -1(%rdx,%rax)
-; X64-NEXT:    movzbl (%rdi,%rax), %ecx
-; X64-NEXT:    movzbl (%rsi,%rax), %r8d
-; X64-NEXT:    addl %ecx, %r8d
-; X64-NEXT:    movb %r8b, (%rdx,%rax)
+; X64-NEXT:    movzbl -3(%rsi,%rax), %ecx
+; X64-NEXT:    addb -3(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -3(%rdx,%rax)
+; X64-NEXT:    movzbl -2(%rsi,%rax), %ecx
+; X64-NEXT:    addb -2(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -2(%rdx,%rax)
+; X64-NEXT:    movzbl -1(%rsi,%rax), %ecx
+; X64-NEXT:    addb -1(%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, -1(%rdx,%rax)
+; X64-NEXT:    movzbl (%rsi,%rax), %ecx
+; X64-NEXT:    addb (%rdi,%rax), %cl
+; X64-NEXT:    movb %cl, (%rdx,%rax)
 ; X64-NEXT:    addq $4, %rax
 ; X64-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X64-NEXT:    jne .LBB3_1
@@ -337,7 +333,6 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-LABEL: foldedidx:
 ; X32:       # %bb.0: # %entry
 ; X32-NEXT:    pushl %ebx
-; X32-NEXT:    pushl %edi
 ; X32-NEXT:    pushl %esi
 ; X32-NEXT:    movl $3, %eax
 ; X32-NEXT:    movl {{[0-9]+}}(%esp), %ecx
@@ -346,28 +341,23 @@ define void @foldedidx(ptr nocapture %a, ptr nocapture %b, ptr nocapture %c) nou
 ; X32-NEXT:    .p2align 4, 0x90
 ; X32-NEXT:  .LBB3_1: # %for.body
 ; X32-NEXT:    # =>This Inner Loop Header: Depth=1
-; X32-NEXT:    movzbl -3(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -3(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -3(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -3(%ecx,%eax)
-; X32-NEXT:    movzbl -2(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -2(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -2(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -2(%ecx,%eax)
-; X32-NEXT:    movzbl -1(%esi,%eax), %edi
 ; X32-NEXT:    movzbl -1(%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb -1(%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, -1(%ecx,%eax)
-; X32-NEXT:    movzbl (%esi,%eax), %edi
 ; X32-NEXT:    movzbl (%edx,%eax), %ebx
-; X32-NEXT:    addl %edi, %ebx
+; X32-NEXT:    addb (%esi,%eax), %bl
 ; X32-NEXT:    movb %bl, (%ecx,%eax)
 ; X32-NEXT:    addl $4, %eax
 ; X32-NEXT:    cmpl $403, %eax # imm = 0x193
 ; X32-NEXT:    jne .LBB3_1
 ; X32-NEXT:  # %bb.2: # %for.end
 ; X32-NEXT:    popl %esi
-; X32-NEXT:    popl %edi
 ; X32-NEXT:    popl %ebx
 ; X32-NEXT:    retl
 entry:

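All of the X86 hunks above exercise the same effect: once the truncate is pushed through the operands of a wide op whose result is only consumed at i8, the 32-bit logic and arithmetic instructions are selected as byte instructions instead (andl/orl/xorl on %eax-class registers become andb/orb/xorb on %al-class registers, often also dropping a movzbl or a "# kill" annotation). A hypothetical minimal reproducer, sketched here only for orientation and not itself one of the tests in this patch:

define i8 @xor_then_trunc(i32 %x, i32 %y) {
  %w = xor i32 %x, %y       ; wide op whose only user is the trunc below,
  %t = trunc i32 %w to i8   ; so it can be rewritten as an xor of two truncs
  ret i8 %t                 ; and selected as a byte op (roughly: xorb %sil, %dil)
}
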
>From 627f4c6e6a9e4435b4ea3edbfac2e67aa543c0a0 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 20:05:19 -0700
Subject: [PATCH 35/56] Update additional X86 tests

---
 llvm/test/CodeGen/X86/ctpop-mask.ll       |  16 +-
 llvm/test/CodeGen/X86/fpenv.ll            |  13 +-
 llvm/test/CodeGen/X86/funnel-shift.ll     |  26 +-
 llvm/test/CodeGen/X86/shift-amount-mod.ll | 172 ++++++------
 llvm/test/CodeGen/X86/shift-i128.ll       | 310 ++++++++++++----------
 5 files changed, 283 insertions(+), 254 deletions(-)
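
The hunks in this patch mostly narrow shift-amount arithmetic: a variable x86 shift reads its count from %cl and masks it to the operand width, so only the low bits of the count are demanded and the add/sub/neg feeding it can be performed as byte ops. A sketch of the shape involved (illustrative only, not one of the updated tests):

define i32 @shr_by_sum(i32 %v, i32 %a, i32 %b) {
  %amt = add i32 %a, %b     ; only the low 5 bits of %amt reach shrl's %cl,
  %r = lshr i32 %v, %amt    ; so the add can be selected as an addb feeding %cl
  ret i32 %r
}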

diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index a43dba94d30c78..cbb0ae719e5aa3 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -148,10 +148,9 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
 ;
 ; X86-NO-POPCOUNT-LABEL: ctpop_shifted_mask3:
 ; X86-NO-POPCOUNT:       # %bb.0:
-; X86-NO-POPCOUNT-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NO-POPCOUNT-NEXT:    andl $14, %ecx
+; X86-NO-POPCOUNT-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NO-POPCOUNT-NEXT:    andb $14, %cl
 ; X86-NO-POPCOUNT-NEXT:    movl $59796, %eax # imm = 0xE994
-; X86-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-POPCOUNT-NEXT:    shrl %cl, %eax
 ; X86-NO-POPCOUNT-NEXT:    andl $3, %eax
 ; X86-NO-POPCOUNT-NEXT:    # kill: def $ax killed $ax killed $eax
@@ -160,7 +159,7 @@ define i16 @ctpop_shifted_mask3(i16 %x) nounwind readnone {
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask3:
 ; X64-NO-POPCOUNT:       # %bb.0:
 ; X64-NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT:    andl $14, %ecx
+; X64-NO-POPCOUNT-NEXT:    andb $14, %cl
 ; X64-NO-POPCOUNT-NEXT:    movl $59796, %eax # imm = 0xE994
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-POPCOUNT-NEXT:    shrl %cl, %eax
@@ -201,7 +200,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    andl $15, %edi
+; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
 ; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -241,9 +240,10 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    movl %edi, %ecx
-; X64-NO-POPCOUNT-NEXT:    shrl $7, %ecx
-; X64-NO-POPCOUNT-NEXT:    andl $60, %ecx
+; X64-NO-POPCOUNT-NEXT:    # kill: def $edi killed $edi def $rdi
+; X64-NO-POPCOUNT-NEXT:    shrl $9, %edi
+; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
+; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-POPCOUNT-NEXT:    shrq %cl, %rax
diff --git a/llvm/test/CodeGen/X86/fpenv.ll b/llvm/test/CodeGen/X86/fpenv.ll
index c79e19f07cda54..42a753cc4ee4c3 100644
--- a/llvm/test/CodeGen/X86/fpenv.ll
+++ b/llvm/test/CodeGen/X86/fpenv.ll
@@ -185,8 +185,9 @@ define void @func_05(i32 %x) nounwind {
 ; X86-NOSSE-LABEL: func_05:
 ; X86-NOSSE:       # %bb.0:
 ; X86-NOSSE-NEXT:    pushl %eax
-; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NOSSE-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-NOSSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NOSSE-NEXT:    addl %ecx, %ecx
+; X86-NOSSE-NEXT:    addb $4, %cl
 ; X86-NOSSE-NEXT:    movl $201, %eax
 ; X86-NOSSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NOSSE-NEXT:    shll %cl, %eax
@@ -203,8 +204,9 @@ define void @func_05(i32 %x) nounwind {
 ; X86-SSE-LABEL: func_05:
 ; X86-SSE:       # %bb.0:
 ; X86-SSE-NEXT:    pushl %eax
-; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE-NEXT:    leal 4(%eax,%eax), %ecx
+; X86-SSE-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SSE-NEXT:    addl %ecx, %ecx
+; X86-SSE-NEXT:    addb $4, %cl
 ; X86-SSE-NEXT:    movl $201, %eax
 ; X86-SSE-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-SSE-NEXT:    shll %cl, %eax
@@ -227,7 +229,8 @@ define void @func_05(i32 %x) nounwind {
 ; X64-LABEL: func_05:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal 4(%rdi,%rdi), %ecx
+; X64-NEXT:    leal (%rdi,%rdi), %ecx
+; X64-NEXT:    addb $4, %cl
 ; X64-NEXT:    movl $201, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shll %cl, %eax
diff --git a/llvm/test/CodeGen/X86/funnel-shift.ll b/llvm/test/CodeGen/X86/funnel-shift.ll
index a464d78f9af38c..c9e9bda1dcd3b4 100644
--- a/llvm/test/CodeGen/X86/funnel-shift.ll
+++ b/llvm/test/CodeGen/X86/funnel-shift.ll
@@ -182,8 +182,9 @@ define i37 @fshl_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX-NEXT:    mulq %rdx
 ; X64-AVX-NEXT:    leal (%rdx,%rdx,8), %eax
-; X64-AVX-NEXT:    leal (%rdx,%rax,4), %eax
-; X64-AVX-NEXT:    subl %eax, %ecx
+; X64-AVX-NEXT:    shll $2, %eax
+; X64-AVX-NEXT:    addb %dl, %al
+; X64-AVX-NEXT:    subb %al, %cl
 ; X64-AVX-NEXT:    shlq $27, %rsi
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-AVX-NEXT:    shldq %cl, %rsi, %rdi
@@ -349,9 +350,10 @@ define i37 @fshr_i37(i37 %x, i37 %y, i37 %z) nounwind {
 ; X64-AVX-NEXT:    movabsq $498560650640798693, %rdx # imm = 0x6EB3E45306EB3E5
 ; X64-AVX-NEXT:    mulq %rdx
 ; X64-AVX-NEXT:    leal (%rdx,%rdx,8), %eax
-; X64-AVX-NEXT:    leal (%rdx,%rax,4), %eax
-; X64-AVX-NEXT:    subl %eax, %ecx
-; X64-AVX-NEXT:    addl $27, %ecx
+; X64-AVX-NEXT:    shll $2, %eax
+; X64-AVX-NEXT:    addb %dl, %al
+; X64-AVX-NEXT:    subb %al, %cl
+; X64-AVX-NEXT:    addb $27, %cl
 ; X64-AVX-NEXT:    shlq $27, %rsi
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-AVX-NEXT:    shrdq %cl, %rdi, %rsi
@@ -437,16 +439,15 @@ define i32 @fshl_i32_undef0_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshl_i32_undef0_msk:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    andl $7, %ecx
-; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andb $7, %cl
 ; X86-SSE2-NEXT:    shldl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshl_i32_undef0_msk:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movl %esi, %ecx
-; X64-AVX-NEXT:    andl $7, %ecx
+; X64-AVX-NEXT:    andb $7, %cl
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX-NEXT:    shldl %cl, %edi, %eax
 ; X64-AVX-NEXT:    retq
@@ -694,16 +695,15 @@ define i32 @fshr_i32_undef1_msk(i32 %a0, i32 %a1) nounwind {
 ; X86-SSE2-LABEL: fshr_i32_undef1_msk:
 ; X86-SSE2:       # %bb.0:
 ; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SSE2-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SSE2-NEXT:    andl $7, %ecx
-; X86-SSE2-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-SSE2-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-SSE2-NEXT:    andb $7, %cl
 ; X86-SSE2-NEXT:    shrdl %cl, %eax, %eax
 ; X86-SSE2-NEXT:    retl
 ;
 ; X64-AVX-LABEL: fshr_i32_undef1_msk:
 ; X64-AVX:       # %bb.0:
 ; X64-AVX-NEXT:    movl %esi, %ecx
-; X64-AVX-NEXT:    andl $7, %ecx
+; X64-AVX-NEXT:    andb $7, %cl
 ; X64-AVX-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-AVX-NEXT:    shrdl %cl, %edi, %eax
 ; X64-AVX-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/shift-amount-mod.ll b/llvm/test/CodeGen/X86/shift-amount-mod.ll
index 9f7ac748c47e16..c212e028b6c90e 100644
--- a/llvm/test/CodeGen/X86/shift-amount-mod.ll
+++ b/llvm/test/CodeGen/X86/shift-amount-mod.ll
@@ -735,10 +735,9 @@ define i32 @reg32_lshr_by_sub_from_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_from_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -763,8 +762,8 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -782,7 +781,7 @@ define i64 @reg64_lshr_by_sub_from_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64-LABEL: reg64_lshr_by_sub_from_negated:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -800,9 +799,8 @@ define i32 @reg32_lshr_by_sub_of_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_sub_of_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -826,8 +824,8 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $-64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -844,7 +842,7 @@ define i64 @reg64_lshr_by_sub_of_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64-LABEL: reg64_lshr_by_sub_of_negated:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -862,9 +860,8 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X86-LABEL: reg32_lshr_by_add_to_negated:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -872,7 +869,7 @@ define i32 @reg32_lshr_by_add_to_negated(i32 %val, i32 %a, i32 %b) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -887,8 +884,8 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -906,7 +903,7 @@ define i64 @reg64_lshr_by_add_to_negated(i64 %val, i64 %a, i64 %b) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -923,9 +920,8 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X86-LABEL: reg32_lshr_by_sub_of_negated_amts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -933,7 +929,7 @@ define i32 @reg32_lshr_by_sub_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -949,8 +945,8 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -967,7 +963,7 @@ define i64 @reg64_lshr_by_sub_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -985,10 +981,9 @@ define i32 @reg32_lshr_by_add_of_negated_amts(i32 %val, i32 %a, i32 %b) nounwind
 ; X86-LABEL: reg32_lshr_by_add_of_negated_amts:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1014,8 +1009,8 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $-128, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -1033,7 +1028,7 @@ define i64 @reg64_lshr_by_add_of_negated_amts(i64 %val, i64 %a, i64 %b) nounwind
 ; X64-LABEL: reg64_lshr_by_add_of_negated_amts:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -1109,10 +1104,9 @@ define i32 @reg32_lshr_by_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_sub_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    negb %cl
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1138,8 +1132,8 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %dl
 ; X86-NEXT:    movb $64, %cl
 ; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
@@ -1157,7 +1151,7 @@ define i64 @reg64_lshr_by_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b) nounw
 ; X64-LABEL: reg64_lshr_by_negated_unfolded_sub_b:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    negb %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
@@ -1173,9 +1167,8 @@ define i32 @reg32_lshr_by_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1200,8 +1193,8 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $-64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -1218,7 +1211,7 @@ define i64 @reg64_lshr_by_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b) nounw
 ; X64-LABEL: reg64_lshr_by_b_sub_negated_unfolded:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1233,9 +1226,8 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw
 ; X86-LABEL: reg32_lshr_by_negated_unfolded_add_b:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1243,7 +1235,7 @@ define i32 @reg32_lshr_by_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1259,8 +1251,8 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    addb $64, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
@@ -1278,7 +1270,7 @@ define i64 @reg64_lshr_by_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b) nounw
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1355,9 +1347,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $31, %cl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1366,9 +1358,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_sub_b(i32 %val, i32 %a, i32 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %esi, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl $31, %ecx
-; X64-NEXT:    subl %edx, %ecx
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb $31, %cl
+; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1385,9 +1377,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    andb $63, %cl
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1404,9 +1397,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_sub_b(i64 %val, i64 %a, i64 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rsi, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %ecx
-; X64-NEXT:    andl $63, %ecx
-; X64-NEXT:    subl %edx, %ecx
+; X64-NEXT:    negb %cl
+; X64-NEXT:    andb $63, %cl
+; X64-NEXT:    subb %dl, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1421,12 +1414,11 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b
 ; X86-LABEL: reg32_lshr_by_masked_b_sub_negated_unfolded:
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $31, %edx
-; X86-NEXT:    subl %edx, %ecx
-; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %dl
+; X86-NEXT:    andb $31, %dl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
 ;
@@ -1434,9 +1426,9 @@ define i32 @reg32_lshr_by_masked_b_sub_negated_unfolded(i32 %val, i32 %a, i32 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edx, %ecx
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $31, %esi
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $31, %sil
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
 ; X64-NEXT:    retq
@@ -1452,11 +1444,12 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    pushl %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
 ; X86-NEXT:    xorl %edx, %edx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %edx
-; X86-NEXT:    andl $63, %edx
-; X86-NEXT:    subl %edx, %ecx
+; X86-NEXT:    movb {{[0-9]+}}(%esp), %ch
+; X86-NEXT:    subb %ch, %dl
+; X86-NEXT:    andb $63, %dl
+; X86-NEXT:    subb %dl, %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1473,9 +1466,9 @@ define i64 @reg64_lshr_by_masked_b_sub_negated_unfolded(i64 %val, i64 %a, i64 %b
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdx, %rcx
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    subl %esi, %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $63, %sil
+; X64-NEXT:    subb %sil, %cl
 ; X64-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
@@ -1491,9 +1484,9 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $31, %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    subb {{[0-9]+}}(%esp), %cl
+; X86-NEXT:    andb $31, %cl
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NEXT:    shrl %cl, %eax
 ; X86-NEXT:    retl
@@ -1503,8 +1496,8 @@ define i32 @reg32_lshr_by_masked_negated_unfolded_add_b(i32 %val, i32 %a, i32 %b
 ; X64-NEXT:    # kill: def $edx killed $edx def $rdx
 ; X64-NEXT:    # kill: def $esi killed $esi def $rsi
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $31, %esi
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $31, %sil
 ; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrl %cl, %eax
@@ -1522,9 +1515,10 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    xorl %ecx, %ecx
-; X86-NEXT:    subl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $63, %ecx
-; X86-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    subb %dl, %cl
+; X86-NEXT:    andb $63, %cl
+; X86-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; X86-NEXT:    movl %esi, %edx
 ; X86-NEXT:    shrl %cl, %edx
 ; X86-NEXT:    shrdl %cl, %esi, %eax
@@ -1540,9 +1534,9 @@ define i64 @reg64_lshr_by_masked_negated_unfolded_add_b(i64 %val, i64 %a, i64 %b
 ; X64-LABEL: reg64_lshr_by_masked_negated_unfolded_add_b:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    negl %esi
-; X64-NEXT:    andl $63, %esi
-; X64-NEXT:    leal (%rdx,%rsi), %ecx
+; X64-NEXT:    negb %sil
+; X64-NEXT:    andb $63, %sil
+; X64-NEXT:    leal (%rsi,%rdx), %ecx
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NEXT:    shrq %cl, %rax
 ; X64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index 767bd772ab7a3e..e5a158caf59b19 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -253,13 +253,21 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $112, %esp
-; i686-NEXT:    movl 40(%ebp), %edx
-; i686-NEXT:    movl 24(%ebp), %eax
-; i686-NEXT:    movl 28(%ebp), %ecx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 20(%ebp), %edi
+; i686-NEXT:    subl $92, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl 16(%ebp), %edi
 ; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
@@ -276,66 +284,74 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    andl $31, %ebx
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 40(%esp,%edx), %eax
-; i686-NEXT:    movl 36(%esp,%edx), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %eax, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ecx
+; i686-NEXT:    andl $7, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 44(%esp,%edx), %edx
+; i686-NEXT:    shrl $3, %ebx
+; i686-NEXT:    andl $15, %ebx
+; i686-NEXT:    movl 32(%esp,%ebx), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 36(%esp,%ebx), %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl %ebx, %esi
-; i686-NEXT:    shrdl %cl, %edx, %eax
+; i686-NEXT:    addl %edx, %edx
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 28(%esp,%ebx), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 56(%ebp), %edx
-; i686-NEXT:    movl %edx, %eax
-; i686-NEXT:    andl $31, %eax
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 72(%esp,%edx), %ebx
-; i686-NEXT:    movl 68(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %ebx, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 64(%esp,%edx), %edi
-; i686-NEXT:    movl 76(%esp,%edx), %edx
-; i686-NEXT:    shrdl %cl, %edx, %ebx
-; i686-NEXT:    movl %esi, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shrl %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    shrl %cl, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl %ebx, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %edi, 16(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    movl %esi, %edx
+; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $15, %esi
+; i686-NEXT:    movl 64(%esp,%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    leal (%ebp,%ebp), %edi
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    orl %eax, %edi
+; i686-NEXT:    movl 40(%esp,%ebx), %eax
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    movl 60(%esp,%esi), %ecx
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl 72(%esp,%esi), %esi
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %ebx, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl %esi, 28(%ecx)
+; i686-NEXT:    movl %ebp, 24(%ecx)
+; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, 16(%ecx)
+; i686-NEXT:    movl %eax, 12(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, (%ecx)
+; i686-NEXT:    movl %edi, 20(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -380,92 +396,97 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $112, %esp
-; i686-NEXT:    movl 40(%ebp), %edx
-; i686-NEXT:    movl 24(%ebp), %eax
-; i686-NEXT:    movl 28(%ebp), %ecx
-; i686-NEXT:    movl 32(%ebp), %esi
-; i686-NEXT:    movl 16(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 12(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 20(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    subl $92, %esp
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %ebp
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edx, %eax
-; i686-NEXT:    andl $31, %eax
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 40(%esp,%edx), %esi
-; i686-NEXT:    movl 36(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 32(%esp,%edx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 44(%esp,%edx), %edx
-; i686-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edx, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 56(%ebp), %edx
-; i686-NEXT:    movl %edx, %ebx
-; i686-NEXT:    andl $31, %ebx
-; i686-NEXT:    shrl $3, %edx
-; i686-NEXT:    andl $12, %edx
-; i686-NEXT:    movl 72(%esp,%edx), %esi
-; i686-NEXT:    movl 68(%esp,%edx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 64(%esp,%edx), %ecx
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, %ecx
+; i686-NEXT:    andl $7, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 76(%esp,%edx), %edx
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    shrl $3, %edi
+; i686-NEXT:    andl $15, %edi
+; i686-NEXT:    movl 32(%esp,%edi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 36(%esp,%edi), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    addl %edx, %edx
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %esi, %edx
+; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    shrl $3, %esi
+; i686-NEXT:    andl $15, %esi
+; i686-NEXT:    movl 64(%esp,%esi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    movl 68(%esp,%esi), %ebp
+; i686-NEXT:    leal (%ebp,%ebp), %ebx
+; i686-NEXT:    shll %cl, %ebx
+; i686-NEXT:    orl %eax, %ebx
+; i686-NEXT:    movl 28(%esp,%edi), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 40(%esp,%edi), %edi
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    sarl %cl, (%esp) # 4-byte Folded Spill
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; i686-NEXT:    movl 60(%esp,%esi), %ecx
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; i686-NEXT:    movl 72(%esp,%esi), %esi
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    shrdl %cl, %esi, %ebp
+; i686-NEXT:    movl %eax, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, %edi
-; i686-NEXT:    sarl %cl, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl %esi, 24(%eax)
+; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %edi, 16(%eax)
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    sarl %cl, %edi
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    sarl %cl, %esi
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; i686-NEXT:    movl %esi, 28(%eax)
+; i686-NEXT:    movl %ebp, 24(%eax)
 ; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 12(%eax)
+; i686-NEXT:    movl %ecx, 16(%eax)
+; i686-NEXT:    movl %edi, 12(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, 8(%eax)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    leal -12(%ebp), %esp
+; i686-NEXT:    movl %ebx, 20(%eax)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl %ecx, 4(%eax)
+; i686-NEXT:    addl $92, %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -549,8 +570,17 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl %edi, %ecx
 ; i686-NEXT:    andl $31, %ecx
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shll %cl, %edx
+; i686-NEXT:    movl 4(%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrl %esi
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edx, %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    shrl %cl, %esi
+; i686-NEXT:    orl %edx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; i686-NEXT:    movl (%eax), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 56(%ebp), %eax
 ; i686-NEXT:    movl %eax, %edx
@@ -570,12 +600,14 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    andl $31, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %ecx, %eax
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    shldl %cl, %edi, %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    shrl %eax
+; i686-NEXT:    notb %cl
+; i686-NEXT:    shrl %cl, %eax
+; i686-NEXT:    orl %edi, %eax
+; i686-NEXT:    movl (%esi), %ecx
+; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    movl %esi, %edi
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    shll %cl, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill

>From 53347b13fcbc1f7bba1e1d6ab2e7d9dd979d1b01 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 11 Sep 2024 21:18:17 -0700
Subject: [PATCH 36/56] Narrowing vectors isn't profitable; update the
 affected tests

---
 llvm/lib/Target/X86/X86ISelLowering.cpp       |  2 ++
 llvm/test/CodeGen/X86/avx512-mask-op.ll       | 34 +++++++++----------
 llvm/test/CodeGen/X86/avx512-select.ll        |  8 ++---
 .../test/CodeGen/X86/bitcast-and-setcc-256.ll |  4 +--
 llvm/test/CodeGen/X86/fold-select.ll          |  4 +--
 .../CodeGen/X86/srem-seteq-vec-nonsplat.ll    | 12 +++----
 6 files changed, 32 insertions(+), 32 deletions(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index de8cfe31a5529f..f07d5c43209b52 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34901,6 +34901,8 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                               EVT DestVT) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
+  if (SrcVT.isVector() || DestVT.isVector())
+    return false;
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
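
For quick reference, the predicate after this hunk behaves as in the
following minimal standalone sketch. It uses a stand-in VT enum rather
than the real llvm::EVT so it compiles on its own; it is an illustration
of the rule, not the actual hook:

// Stand-in sketch of the updated isNarrowingProfitable predicate.
// Narrowing is now rejected outright for vector types; the scalar rule
// is unchanged: everything is profitable except i32 -> i16, which trades
// a shorter value for the 0x66 operand-size prefix.
#include <cassert>

enum class VT { i8, i16, i32, i64, v8i16, v4i32 };

static bool isVector(VT T) { return T == VT::v8i16 || T == VT::v4i32; }

static bool isNarrowingProfitable(VT SrcVT, VT DestVT) {
  if (isVector(SrcVT) || isVector(DestVT))
    return false;                                  // new: vectors rejected
  return !(SrcVT == VT::i32 && DestVT == VT::i16); // unchanged scalar rule
}

int main() {
  assert(isNarrowingProfitable(VT::i32, VT::i8));       // scalar: allowed
  assert(!isNarrowingProfitable(VT::i32, VT::i16));     // 0x66 penalty
  assert(!isNarrowingProfitable(VT::v4i32, VT::v8i16)); // vector: rejected
}

The early-out matches the commit subject: narrowing vector operations
isn't profitable on X86, so the combine is restricted to scalars.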
 
diff --git a/llvm/test/CodeGen/X86/avx512-mask-op.ll b/llvm/test/CodeGen/X86/avx512-mask-op.ll
index 9e689341f7b88e..856c5d96df5df7 100644
--- a/llvm/test/CodeGen/X86/avx512-mask-op.ll
+++ b/llvm/test/CodeGen/X86/avx512-mask-op.ll
@@ -1952,15 +1952,15 @@ declare void @f2(i32) #1
 define void @store_i16_i1(i16 %x, ptr%y) {
 ; CHECK-LABEL: store_i16_i1:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    andb $1, %dil
 ; CHECK-NEXT:    movb %dil, (%rsi)
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: store_i16_i1:
 ; X86:       ## %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movzwl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    andl $1, %ecx
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    andb $1, %cl
 ; X86-NEXT:    movb %cl, (%eax)
 ; X86-NEXT:    retl
   %c = trunc i16 %x to i1
@@ -1971,7 +1971,7 @@ define void @store_i16_i1(i16 %x, ptr%y) {
 define void @store_i8_i1(i8 %x, ptr%y) {
 ; CHECK-LABEL: store_i8_i1:
 ; CHECK:       ## %bb.0:
-; CHECK-NEXT:    andl $1, %edi
+; CHECK-NEXT:    andb $1, %dil
 ; CHECK-NEXT:    movb %dil, (%rsi)
 ; CHECK-NEXT:    retq
 ;
@@ -3936,7 +3936,7 @@ define i8 @test_v8i1_add(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_add:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -3956,7 +3956,7 @@ define i8 @test_v8i1_sub(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_sub:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -3976,7 +3976,7 @@ define i8 @test_v8i1_mul(i8 %x, i8 %y) {
 ; CHECK-LABEL: test_v8i1_mul:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5128,7 +5128,7 @@ define i1 @test_v1i1_add(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_add:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5148,7 +5148,7 @@ define i1 @test_v1i1_sub(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_sub:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    xorl %esi, %eax
+; CHECK-NEXT:    xorb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5168,7 +5168,7 @@ define i1 @test_v1i1_mul(i1 %x, i1 %y) {
 ; CHECK-LABEL: test_v1i1_mul:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    andl %esi, %eax
+; CHECK-NEXT:    andb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
@@ -5188,15 +5188,14 @@ define <1 x i1> @uadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; CHECK-LABEL: uadd_sat_v1i1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: uadd_sat_v1i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
   %z = call <1 x i1> @llvm.uadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y)
   ret <1 x i1> %z
@@ -5257,15 +5256,14 @@ define <1 x i1> @sadd_sat_v1i1(<1 x i1> %x, <1 x i1> %y) nounwind {
 ; CHECK-LABEL: sadd_sat_v1i1:
 ; CHECK:       ## %bb.0:
 ; CHECK-NEXT:    movl %edi, %eax
-; CHECK-NEXT:    orl %esi, %eax
+; CHECK-NEXT:    orb %sil, %al
 ; CHECK-NEXT:    ## kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retq
 ;
 ; X86-LABEL: sadd_sat_v1i1:
 ; X86:       ## %bb.0:
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    orl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    ## kill: def $al killed $al killed $eax
+; X86-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    orb {{[0-9]+}}(%esp), %al
 ; X86-NEXT:    retl
   %z = call <1 x i1> @llvm.sadd.sat.v1i1(<1 x i1> %x, <1 x i1> %y)
   ret <1 x i1> %z
diff --git a/llvm/test/CodeGen/X86/avx512-select.ll b/llvm/test/CodeGen/X86/avx512-select.ll
index 536c667c7ec902..2c75d68428cb80 100644
--- a/llvm/test/CodeGen/X86/avx512-select.ll
+++ b/llvm/test/CodeGen/X86/avx512-select.ll
@@ -137,8 +137,8 @@ define i8 @select05(i8 %a.0, i8 %m) {
 ;
 ; X64-LABEL: select05:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    orl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    orb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
@@ -212,8 +212,8 @@ define i8 @select06(i8 %a.0, i8 %m) {
 ;
 ; X64-LABEL: select06:
 ; X64:       # %bb.0:
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    andl %esi, %eax
+; X64-NEXT:    movl %esi, %eax
+; X64-NEXT:    andb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %mask = bitcast i8 %m to <8 x i1>
diff --git a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
index 234c7a0a500d30..d81a81ee93c6ed 100644
--- a/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
+++ b/llvm/test/CodeGen/X86/bitcast-and-setcc-256.ll
@@ -873,7 +873,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) {
 ; SSE2-SSSE3-NEXT:    pxor %xmm1, %xmm1
 ; SSE2-SSSE3-NEXT:    pcmpeqd %xmm0, %xmm1
 ; SSE2-SSSE3-NEXT:    movmskps %xmm1, %eax
-; SSE2-SSSE3-NEXT:    xorl $15, %eax
+; SSE2-SSSE3-NEXT:    xorb $15, %al
 ; SSE2-SSSE3-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE2-SSSE3-NEXT:    retq
 ;
@@ -882,7 +882,7 @@ define i8 @v4i32_concat_undef(<4 x i32> %vec) {
 ; AVX12-NEXT:    vpxor %xmm1, %xmm1, %xmm1
 ; AVX12-NEXT:    vpcmpeqd %xmm1, %xmm0, %xmm0
 ; AVX12-NEXT:    vmovmskps %xmm0, %eax
-; AVX12-NEXT:    xorl $15, %eax
+; AVX12-NEXT:    xorb $15, %al
 ; AVX12-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX12-NEXT:    retq
 ;
diff --git a/llvm/test/CodeGen/X86/fold-select.ll b/llvm/test/CodeGen/X86/fold-select.ll
index 31afe979a33b35..cc9a525ba1f39a 100644
--- a/llvm/test/CodeGen/X86/fold-select.ll
+++ b/llvm/test/CodeGen/X86/fold-select.ll
@@ -19,8 +19,8 @@ define <8 x float> @select_and_v8i1(<8 x i1> %a, <8 x i1> %b, <8 x i1> %c, <8 x
 define <8 x float> @select_and_v8i1_2(i8 %m1, i8 %m2, i8 %m3, <8 x float> %d) {
 ; CHECK-LABEL: select_and_v8i1_2:
 ; CHECK:       # %bb.0:
-; CHECK-NEXT:    orl %esi, %edi
-; CHECK-NEXT:    andl %edx, %edi
+; CHECK-NEXT:    orb %sil, %dil
+; CHECK-NEXT:    andb %dl, %dil
 ; CHECK-NEXT:    kmovd %edi, %k1
 ; CHECK-NEXT:    vbroadcastss {{.*#+}} ymm1 = [1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0,1.0E+0]
 ; CHECK-NEXT:    vmovaps %ymm0, %ymm1 {%k1}
diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
index 6fe45b37f030e8..220c2e5012ea49 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll
@@ -2476,14 +2476,14 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) {
 ; CHECK-AVX512VL-NEXT:    vpackuswb %ymm3, %ymm2, %ymm2
 ; CHECK-AVX512VL-NEXT:    vpminub {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm2, %ymm3
 ; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpxor %xmm3, %xmm3, %xmm3
-; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm3, %ymm0, %ymm0
-; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm3, %ymm1, %ymm1
 ; CHECK-AVX512VL-NEXT:    vmovdqa {{.*#+}} ymm3 = [255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,255,0,255,255,255,255,255,255,255,255,255,255]
 ; CHECK-AVX512VL-NEXT:    vpandn %ymm3, %ymm2, %ymm2
-; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm0
-; CHECK-AVX512VL-NEXT:    vpternlogq $50, %ymm2, %ymm1, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpxor %xmm4, %xmm4, %xmm4
+; CHECK-AVX512VL-NEXT:    vpcmpgtb %ymm4, %ymm0, %ymm0
+; CHECK-AVX512VL-NEXT:    vpandn %ymm0, %ymm3, %ymm3
+; CHECK-AVX512VL-NEXT:    vpcmpeqb %ymm4, %ymm1, %ymm0
+; CHECK-AVX512VL-NEXT:    vpternlogq $14, %ymm3, %ymm2, %ymm0
 ; CHECK-AVX512VL-NEXT:    retq
   %rem = srem <32 x i8> %x, <i8 13, i8 5, i8 19, i8 34, i8 2, i8 8, i8 2, i8 88, i8 62, i8 62, i8 5, i8 7, i8 97, i8 2, i8 3, i8 60, i8 3, i8 87, i8 7, i8 6, i8 84, i8 -128, i8 127, i8 56, i8 114, i8 1, i8 50, i8 7, i8 2, i8 8, i8 97, i8 117>
   %cmp = icmp ne <32 x i8> %rem, zeroinitializer

>From 1f70ab400b7cdd798dd2e94b820da09034835ae6 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:19:24 -0700
Subject: [PATCH 37/56] Update tests that now emit additional mov instructions

---
 .../X86/8bit_cmov_of_trunc_promotion.ll       | 178 +++++++++---------
 llvm/test/CodeGen/X86/movmsk-cmp.ll           |  26 +--
 ...unfold-masked-merge-scalar-variablemask.ll |  24 +--
 3 files changed, 117 insertions(+), 111 deletions(-)

diff --git a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
index 1a8d33f5b3480f..f4b54bf22d0f52 100644
--- a/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
+++ b/llvm/test/CodeGen/X86/8bit_cmov_of_trunc_promotion.ll
@@ -11,62 +11,63 @@
 define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: t0:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addb %cl, %al
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB0_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB0_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: t0:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %cl, %ecx
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: t0:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addb %cl, %al
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB0_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB0_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: t0:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %cl, %ecx
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: t0:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -84,64 +85,63 @@ define i8 @t0(i32 %a1_wide_orig, i32 %a2_wide_orig, i32 %inc) nounwind {
 define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: neg_only_one_truncation:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addb %al, %cl
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB1_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB1_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: neg_only_one_truncation:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
 ; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I386-CMOV-NEXT:    cmpb %al, %cl
-; I386-CMOV-NEXT:    movzbl %al, %eax
-; I386-CMOV-NEXT:    cmovgl %ecx, %eax
+; I386-CMOV-NEXT:    cmpb %cl, %al
+; I386-CMOV-NEXT:    movzbl %al, %edx
+; I386-CMOV-NEXT:    movzbl %cl, %eax
+; I386-CMOV-NEXT:    cmovgl %edx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: neg_only_one_truncation:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addb %al, %cl
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB1_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB1_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: neg_only_one_truncation:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
 ; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
-; I686-CMOV-NEXT:    cmpb %al, %cl
-; I686-CMOV-NEXT:    movzbl %al, %eax
-; I686-CMOV-NEXT:    cmovgl %ecx, %eax
+; I686-CMOV-NEXT:    cmpb %cl, %al
+; I686-CMOV-NEXT:    movzbl %al, %edx
+; I686-CMOV-NEXT:    movzbl %cl, %eax
+; I686-CMOV-NEXT:    cmovgl %edx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_only_one_truncation:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    addb %sil, %dl
-; X86_64-NEXT:    cmpb %dl, %cl
-; X86_64-NEXT:    movzbl %dl, %eax
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -159,62 +159,63 @@ define i8 @neg_only_one_truncation(i32 %a1_wide_orig, i8 %a2_orig, i32 %inc) nou
 define i8 @neg_type_mismatch(i32 %a1_wide_orig, i16 %a2_wide_orig, i32 %inc) nounwind {
 ; I386-NOCMOV-LABEL: neg_type_mismatch:
 ; I386-NOCMOV:       # %bb.0:
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    addl %ecx, %eax
-; I386-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-NOCMOV-NEXT:    addb %cl, %al
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB2_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
 ; I386-NOCMOV-NEXT:    movl %ecx, %eax
 ; I386-NOCMOV-NEXT:  .LBB2_2:
-; I386-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-NOCMOV-NEXT:    retl
 ;
 ; I386-CMOV-LABEL: neg_type_mismatch:
 ; I386-CMOV:       # %bb.0:
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    addl %eax, %ecx
-; I386-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-CMOV-NEXT:    addb %al, %cl
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %cl, %ecx
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
 ;
 ; I686-NOCMOV-LABEL: neg_type_mismatch:
 ; I686-NOCMOV:       # %bb.0:
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    addl %ecx, %eax
-; I686-NOCMOV-NEXT:    addw {{[0-9]+}}(%esp), %cx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-NOCMOV-NEXT:    addb %cl, %al
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB2_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
 ; I686-NOCMOV-NEXT:    movl %ecx, %eax
 ; I686-NOCMOV-NEXT:  .LBB2_2:
-; I686-NOCMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-NOCMOV-NEXT:    retl
 ;
 ; I686-CMOV-LABEL: neg_type_mismatch:
 ; I686-CMOV:       # %bb.0:
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    addl %eax, %ecx
-; I686-CMOV-NEXT:    addw {{[0-9]+}}(%esp), %ax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-CMOV-NEXT:    addb %al, %cl
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %cl, %ecx
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: neg_type_mismatch:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X86_64-NEXT:    leal (%rdi,%rdx), %ecx
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %cl
+; X86_64-NEXT:    addb %dl, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %dil, %ecx
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %ecx, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
@@ -234,8 +235,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I386-NOCMOV-LABEL: negative_CopyFromReg:
 ; I386-NOCMOV:       # %bb.0:
 ; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I386-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I386-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I386-NOCMOV-NEXT:    cmpb %cl, %al
 ; I386-NOCMOV-NEXT:    jg .LBB3_2
 ; I386-NOCMOV-NEXT:  # %bb.1:
@@ -246,9 +247,10 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I386-CMOV-LABEL: negative_CopyFromReg:
 ; I386-CMOV:       # %bb.0:
 ; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I386-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I386-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I386-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I386-CMOV-NEXT:    cmpb %al, %cl
+; I386-CMOV-NEXT:    movzbl %al, %eax
 ; I386-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I386-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I386-CMOV-NEXT:    retl
@@ -256,8 +258,8 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I686-NOCMOV-LABEL: negative_CopyFromReg:
 ; I686-NOCMOV:       # %bb.0:
 ; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
-; I686-NOCMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-NOCMOV-NEXT:    addl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %ecx
+; I686-NOCMOV-NEXT:    addb {{[0-9]+}}(%esp), %cl
 ; I686-NOCMOV-NEXT:    cmpb %cl, %al
 ; I686-NOCMOV-NEXT:    jg .LBB3_2
 ; I686-NOCMOV-NEXT:  # %bb.1:
@@ -268,19 +270,19 @@ define i8 @negative_CopyFromReg(i32 %a1_wide, i32 %a2_wide_orig, i32 %inc) nounw
 ; I686-CMOV-LABEL: negative_CopyFromReg:
 ; I686-CMOV:       # %bb.0:
 ; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; I686-CMOV-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; I686-CMOV-NEXT:    addl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; I686-CMOV-NEXT:    addb {{[0-9]+}}(%esp), %al
 ; I686-CMOV-NEXT:    cmpb %al, %cl
+; I686-CMOV-NEXT:    movzbl %al, %eax
 ; I686-CMOV-NEXT:    cmovgl %ecx, %eax
 ; I686-CMOV-NEXT:    # kill: def $al killed $al killed $eax
 ; I686-CMOV-NEXT:    retl
 ;
 ; X86_64-LABEL: negative_CopyFromReg:
 ; X86_64:       # %bb.0:
-; X86_64-NEXT:    # kill: def $edx killed $edx def $rdx
-; X86_64-NEXT:    # kill: def $esi killed $esi def $rsi
-; X86_64-NEXT:    leal (%rsi,%rdx), %eax
-; X86_64-NEXT:    cmpb %al, %dil
+; X86_64-NEXT:    addb %dl, %sil
+; X86_64-NEXT:    cmpb %sil, %dil
+; X86_64-NEXT:    movzbl %sil, %eax
 ; X86_64-NEXT:    cmovgl %edi, %eax
 ; X86_64-NEXT:    # kill: def $al killed $al killed $eax
 ; X86_64-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/movmsk-cmp.ll b/llvm/test/CodeGen/X86/movmsk-cmp.ll
index 9b624a935bada8..7b94072b219237 100644
--- a/llvm/test/CodeGen/X86/movmsk-cmp.ll
+++ b/llvm/test/CodeGen/X86/movmsk-cmp.ll
@@ -3655,11 +3655,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; SSE-NEXT:    shrl $15, %ecx
 ; SSE-NEXT:    movl %eax, %edx
 ; SSE-NEXT:    shrl $8, %edx
-; SSE-NEXT:    andl $1, %edx
+; SSE-NEXT:    andb $1, %dl
 ; SSE-NEXT:    andl $8, %eax
 ; SSE-NEXT:    shrl $3, %eax
-; SSE-NEXT:    xorl %edx, %eax
-; SSE-NEXT:    andl %ecx, %eax
+; SSE-NEXT:    xorb %dl, %al
+; SSE-NEXT:    andb %cl, %al
 ; SSE-NEXT:    # kill: def $al killed $al killed $eax
 ; SSE-NEXT:    retq
 ;
@@ -3671,11 +3671,11 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; AVX1OR2-NEXT:    shrl $15, %ecx
 ; AVX1OR2-NEXT:    movl %eax, %edx
 ; AVX1OR2-NEXT:    shrl $8, %edx
-; AVX1OR2-NEXT:    andl $1, %edx
+; AVX1OR2-NEXT:    andb $1, %dl
 ; AVX1OR2-NEXT:    andl $8, %eax
 ; AVX1OR2-NEXT:    shrl $3, %eax
-; AVX1OR2-NEXT:    xorl %edx, %eax
-; AVX1OR2-NEXT:    andl %ecx, %eax
+; AVX1OR2-NEXT:    xorb %dl, %al
+; AVX1OR2-NEXT:    andb %cl, %al
 ; AVX1OR2-NEXT:    # kill: def $al killed $al killed $eax
 ; AVX1OR2-NEXT:    retq
 ;
@@ -3685,8 +3685,8 @@ define i1 @movmsk_v16i8(<16 x i8> %x, <16 x i8> %y) {
 ; KNL-NEXT:    vpextrb $15, %xmm0, %ecx
 ; KNL-NEXT:    vpextrb $8, %xmm0, %edx
 ; KNL-NEXT:    vpextrb $3, %xmm0, %eax
-; KNL-NEXT:    xorl %edx, %eax
-; KNL-NEXT:    andl %ecx, %eax
+; KNL-NEXT:    xorb %dl, %al
+; KNL-NEXT:    andb %cl, %al
 ; KNL-NEXT:    # kill: def $al killed $al killed $eax
 ; KNL-NEXT:    retq
 ;
@@ -4171,7 +4171,9 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; SSE2-NEXT:    pshufd {{.*#+}} xmm1 = xmm0[1,0,3,2]
 ; SSE2-NEXT:    pand %xmm0, %xmm1
 ; SSE2-NEXT:    movmskpd %xmm1, %eax
-; SSE2-NEXT:    xorl $3, %eax
+; SSE2-NEXT:    xorb $3, %al
+; Is this being performed by a different transformation that's using isNarrowingProfitable()?
+; SSE2-NEXT:    movzbl %al, %eax
 ; SSE2-NEXT:    btl %edi, %eax
 ; SSE2-NEXT:    setb %al
 ; SSE2-NEXT:    retq
@@ -4180,7 +4182,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; SSE41:       # %bb.0:
 ; SSE41-NEXT:    pcmpeqq %xmm1, %xmm0
 ; SSE41-NEXT:    movmskpd %xmm0, %eax
-; SSE41-NEXT:    xorl $3, %eax
+; SSE41-NEXT:    xorb $3, %al
+; SSE41-NEXT:    movzbl %al, %eax
 ; SSE41-NEXT:    btl %edi, %eax
 ; SSE41-NEXT:    setb %al
 ; SSE41-NEXT:    retq
@@ -4189,7 +4192,8 @@ define i1 @movmsk_v2i64_var(<2 x i64> %x, <2 x i64> %y, i32 %z) {
 ; AVX1OR2:       # %bb.0:
 ; AVX1OR2-NEXT:    vpcmpeqq %xmm1, %xmm0, %xmm0
 ; AVX1OR2-NEXT:    vmovmskpd %xmm0, %eax
-; AVX1OR2-NEXT:    xorl $3, %eax
+; AVX1OR2-NEXT:    xorb $3, %al
+; AVX1OR2-NEXT:    movzbl %al, %eax
 ; AVX1OR2-NEXT:    btl %edi, %eax
 ; AVX1OR2-NEXT:    setb %al
 ; AVX1OR2-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
index 9c9d06921096cb..6889fe1edff4b3 100644
--- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
+++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-variablemask.ll
@@ -6,18 +6,17 @@
 define i8 @out8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: out8:
 ; CHECK-NOBMI:       # %bb.0:
-; CHECK-NOBMI-NEXT:    movl %edx, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %edi
-; CHECK-NOBMI-NEXT:    notb %al
-; CHECK-NOBMI-NEXT:    andb %sil, %al
-; CHECK-NOBMI-NEXT:    orb %dil, %al
+; CHECK-NOBMI-NEXT:    movl %edi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb %dl, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: out8:
 ; CHECK-BMI:       # %bb.0:
 ; CHECK-BMI-NEXT:    movl %edx, %eax
-; CHECK-BMI-NEXT:    andl %edx, %edi
+; CHECK-BMI-NEXT:    andb %al, %dil
 ; CHECK-BMI-NEXT:    notb %al
 ; CHECK-BMI-NEXT:    andb %sil, %al
 ; CHECK-BMI-NEXT:    orb %dil, %al
@@ -106,17 +105,18 @@ define i8 @in8(i8 %x, i8 %y, i8 %mask) {
 ; CHECK-NOBMI-LABEL: in8:
 ; CHECK-NOBMI:       # %bb.0:
 ; CHECK-NOBMI-NEXT:    movl %edi, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
-; CHECK-NOBMI-NEXT:    andl %edx, %eax
-; CHECK-NOBMI-NEXT:    xorl %esi, %eax
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
+; CHECK-NOBMI-NEXT:    andb %dl, %al
+; CHECK-NOBMI-NEXT:    xorb %sil, %al
 ; CHECK-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NOBMI-NEXT:    retq
 ;
 ; CHECK-BMI-LABEL: in8:
 ; CHECK-BMI:       # %bb.0:
-; CHECK-BMI-NEXT:    andnl %esi, %edx, %eax
-; CHECK-BMI-NEXT:    andl %edx, %edi
-; CHECK-BMI-NEXT:    orl %edi, %eax
+; CHECK-BMI-NEXT:    movl %edi, %eax
+; CHECK-BMI-NEXT:    xorb %sil, %al
+; CHECK-BMI-NEXT:    andb %dl, %al
+; CHECK-BMI-NEXT:    xorb %sil, %al
 ; CHECK-BMI-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-BMI-NEXT:    retq
   %n0 = xor i8 %x, %y

>From b7dad1c7ba5542a43adce13ee23438b4de134477 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:20:22 -0700
Subject: [PATCH 38/56] Changes with non-trivial diffs

---
 llvm/test/CodeGen/X86/ucmp.ll | 2191 +++++++++------------------------
 1 file changed, 560 insertions(+), 1631 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index cd643cb8d63751..8db4213e66a5d8 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -1470,1488 +1470,410 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 }
 
 define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
-; SSE4-LABEL: ucmp_uncommon_vectors:
-; SSE4:       # %bb.0:
-; SSE4-NEXT:    pushq %rbp
-; SSE4-NEXT:    pushq %r15
-; SSE4-NEXT:    pushq %r14
-; SSE4-NEXT:    pushq %r13
-; SSE4-NEXT:    pushq %r12
-; SSE4-NEXT:    pushq %rbx
-; SSE4-NEXT:    subq $120, %rsp
-; SSE4-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    andl $127, %edx
-; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    andl $127, %r8d
-; SSE4-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE4-NEXT:    andl $127, %r10d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    andl $127, %eax
-; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    andl $127, %ecx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE4-NEXT:    andl $127, %r8d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE4-NEXT:    andl $127, %ebx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    andl $127, %edx
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT:    andl $127, %r13d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    andl $127, %r11d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    andl $127, %r14d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    andl $127, %r12d
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE4-NEXT:    cmpq %rax, %rbp
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    sbbq %r14, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rbp, %rax
-; SSE4-NEXT:    sbbq %r12, %r14
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movb %r15b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq %r11, %r15
-; SSE4-NEXT:    sbbq %r13, %r15
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r11, %r13
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE4-NEXT:    cmpq %rax, %r11
-; SSE4-NEXT:    movq %rdx, %r14
-; SSE4-NEXT:    sbbq %rbx, %r14
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %r11, %rax
-; SSE4-NEXT:    sbbq %rdx, %rbx
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movb %bpl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE4-NEXT:    cmpq %rax, %rdx
-; SSE4-NEXT:    movq %r8, %r11
-; SSE4-NEXT:    sbbq %rcx, %r11
-; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rdx, %rax
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdx
-; SSE4-NEXT:    sbbq %r10, %rdx
-; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r8, %r10
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %r10b
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %r10b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %bpl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %bpl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE4-NEXT:    movq %r11, %rdx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rdx
-; SSE4-NEXT:    setb %dl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r11, %r8
-; SSE4-NEXT:    sbbb $0, %dl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %r8
-; SSE4-NEXT:    movq (%rsp), %rbx # 8-byte Reload
-; SSE4-NEXT:    sbbq %rbx, %r8
-; SSE4-NEXT:    setb %r11b
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %rbx
-; SSE4-NEXT:    sbbb $0, %r11b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE4-NEXT:    cmpq %rax, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE4-NEXT:    movq %r14, %rbx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rbx
-; SSE4-NEXT:    setb %bl
-; SSE4-NEXT:    cmpq %rcx, %rax
-; SSE4-NEXT:    sbbq %r14, %r8
-; SSE4-NEXT:    sbbb $0, %bl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE4-NEXT:    cmpq %rax, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE4-NEXT:    movq %r15, %rcx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %rcx
-; SSE4-NEXT:    setb %cl
-; SSE4-NEXT:    cmpq %r14, %rax
-; SSE4-NEXT:    sbbq %r15, %r8
-; SSE4-NEXT:    sbbb $0, %cl
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE4-NEXT:    cmpq %rax, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r14
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r14
-; SSE4-NEXT:    setb %r14b
-; SSE4-NEXT:    cmpq %r15, %rax
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r14b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    cmpq %r9, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    movq %r12, %r15
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r15
-; SSE4-NEXT:    setb %r15b
-; SSE4-NEXT:    cmpq %rax, %r9
-; SSE4-NEXT:    sbbq %r12, %r8
-; SSE4-NEXT:    sbbb $0, %r15b
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE4-NEXT:    cmpq %r12, %rax
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE4-NEXT:    movq %r13, %r9
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    sbbq %r8, %r9
-; SSE4-NEXT:    setb %r9b
-; SSE4-NEXT:    cmpq %rax, %r12
-; SSE4-NEXT:    sbbq %r13, %r8
-; SSE4-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    sbbb $0, %r9b
-; SSE4-NEXT:    cmpq %rsi, %r12
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rdi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rdi
-; SSE4-NEXT:    setb %dil
-; SSE4-NEXT:    cmpq %r12, %rsi
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE4-NEXT:    sbbb $0, %dil
-; SSE4-NEXT:    cmpq %r12, %r13
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE4-NEXT:    movq %r8, %rsi
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    sbbq %rax, %rsi
-; SSE4-NEXT:    setb %sil
-; SSE4-NEXT:    cmpq %r13, %r12
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm1
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm2
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm3
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm4
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r12d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r12d, %xmm5
-; SSE4-NEXT:    movzbl %r10b, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm6
-; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %r10d # 1-byte Folded Reload
-; SSE4-NEXT:    movd %r10d, %xmm7
-; SSE4-NEXT:    movzbl %bpl, %r10d
-; SSE4-NEXT:    movd %r10d, %xmm0
-; SSE4-NEXT:    movzbl %dl, %edx
-; SSE4-NEXT:    movd %edx, %xmm8
-; SSE4-NEXT:    movzbl %r11b, %edx
-; SSE4-NEXT:    movd %edx, %xmm9
-; SSE4-NEXT:    movzbl %bl, %edx
-; SSE4-NEXT:    movd %edx, %xmm10
-; SSE4-NEXT:    movzbl %cl, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm11
-; SSE4-NEXT:    movzbl %r14b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm12
-; SSE4-NEXT:    movzbl %r15b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm13
-; SSE4-NEXT:    movzbl %r9b, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm14
-; SSE4-NEXT:    movzbl %dil, %ecx
-; SSE4-NEXT:    movd %ecx, %xmm15
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm2 = xmm2[0],xmm1[0],xmm2[1],xmm1[1],xmm2[2],xmm1[2],xmm2[3],xmm1[3],xmm2[4],xmm1[4],xmm2[5],xmm1[5],xmm2[6],xmm1[6],xmm2[7],xmm1[7]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm4 = xmm4[0],xmm3[0],xmm4[1],xmm3[1],xmm4[2],xmm3[2],xmm4[3],xmm3[3],xmm4[4],xmm3[4],xmm4[5],xmm3[5],xmm4[6],xmm3[6],xmm4[7],xmm3[7]
-; SSE4-NEXT:    punpcklwd {{.*#+}} xmm4 = xmm4[0],xmm2[0],xmm4[1],xmm2[1],xmm4[2],xmm2[2],xmm4[3],xmm2[3]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm6 = xmm6[0],xmm5[0],xmm6[1],xmm5[1],xmm6[2],xmm5[2],xmm6[3],xmm5[3],xmm6[4],xmm5[4],xmm6[5],xmm5[5],xmm6[6],xmm5[6],xmm6[7],xmm5[7]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm0 = xmm0[0],xmm7[0],xmm0[1],xmm7[1],xmm0[2],xmm7[2],xmm0[3],xmm7[3],xmm0[4],xmm7[4],xmm0[5],xmm7[5],xmm0[6],xmm7[6],xmm0[7],xmm7[7]
-; SSE4-NEXT:    punpcklwd {{.*#+}} xmm0 = xmm0[0],xmm6[0],xmm0[1],xmm6[1],xmm0[2],xmm6[2],xmm0[3],xmm6[3]
-; SSE4-NEXT:    punpckldq {{.*#+}} xmm0 = xmm0[0],xmm4[0],xmm0[1],xmm4[1]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; SSE4-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; SSE4-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; SSE4-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; SSE4-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; SSE4-NEXT:    sbbq %r8, %rax
-; SSE4-NEXT:    sbbb $0, %sil
-; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm0[0]
-; SSE4-NEXT:    movzbl %sil, %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; SSE4-NEXT:    movb %cl, 4(%rax)
-; SSE4-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    leaq (%rdx,%rcx,4), %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $4, %edx
-; SSE4-NEXT:    orq %rcx, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $6, %ecx
-; SSE4-NEXT:    orq %rdx, %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $8, %edx
-; SSE4-NEXT:    orq %rcx, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $10, %ecx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $12, %esi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
-; SSE4-NEXT:    andl $3, %edi
-; SSE4-NEXT:    shll $14, %edi
-; SSE4-NEXT:    orq %rsi, %rdi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shll $16, %ecx
-; SSE4-NEXT:    orq %rdi, %rcx
-; SSE4-NEXT:    orq %rdx, %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $18, %edx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $20, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shll $22, %edx
-; SSE4-NEXT:    orq %rsi, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shll $24, %esi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; SSE4-NEXT:    andl $3, %edx
-; SSE4-NEXT:    shlq $26, %rdx
-; SSE4-NEXT:    orq %rsi, %rdx
-; SSE4-NEXT:    orq %rcx, %rdx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; SSE4-NEXT:    andl $3, %ecx
-; SSE4-NEXT:    shlq $28, %rcx
-; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; SSE4-NEXT:    andl $3, %esi
-; SSE4-NEXT:    shlq $30, %rsi
-; SSE4-NEXT:    orq %rcx, %rsi
-; SSE4-NEXT:    orq %rdx, %rsi
-; SSE4-NEXT:    movl %esi, (%rax)
-; SSE4-NEXT:    addq $120, %rsp
-; SSE4-NEXT:    popq %rbx
-; SSE4-NEXT:    popq %r12
-; SSE4-NEXT:    popq %r13
-; SSE4-NEXT:    popq %r14
-; SSE4-NEXT:    popq %r15
-; SSE4-NEXT:    popq %rbp
-; SSE4-NEXT:    retq
-;
-; SSE2-LABEL: ucmp_uncommon_vectors:
-; SSE2:       # %bb.0:
-; SSE2-NEXT:    pushq %rbp
-; SSE2-NEXT:    pushq %r15
-; SSE2-NEXT:    pushq %r14
-; SSE2-NEXT:    pushq %r13
-; SSE2-NEXT:    pushq %r12
-; SSE2-NEXT:    pushq %rbx
-; SSE2-NEXT:    subq $88, %rsp
-; SSE2-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    andl $127, %r8d
-; SSE2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    andl $127, %edx
-; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    andl $127, %ecx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    andl $127, %eax
-; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    andl $127, %ebx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    andl $127, %edx
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; SSE2-NEXT:    andl $127, %r10d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    andl $127, %r14d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; SSE2-NEXT:    andl $127, %ebp
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; SSE2-NEXT:    andl $127, %r13d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; SSE2-NEXT:    andl $127, %r11d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; SSE2-NEXT:    andl $127, %r15d
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; SSE2-NEXT:    cmpq %rax, %r12
-; SSE2-NEXT:    movq %r15, %r8
-; SSE2-NEXT:    sbbq %r11, %r8
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %r12, %rax
-; SSE2-NEXT:    sbbq %r15, %r11
-; SSE2-NEXT:    sbbb $0, %r8b
-; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %r13, %r11
-; SSE2-NEXT:    sbbq %rbp, %r11
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r13, %rbp
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %r14, %r11
-; SSE2-NEXT:    sbbq %r10, %r11
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %r14, %r10
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; SSE2-NEXT:    cmpq %rax, %r8
-; SSE2-NEXT:    movq %rdx, %r10
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %r8, %rax
-; SSE2-NEXT:    sbbq %rdx, %rbx
-; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; SSE2-NEXT:    cmpq %rax, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %r8
-; SSE2-NEXT:    sbbq %rcx, %r8
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %rdx, %rax
-; SSE2-NEXT:    sbbq %r10, %rcx
-; SSE2-NEXT:    sbbb $0, %r8b
-; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r8, %rdx
-; SSE2-NEXT:    setb %dl
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r10, %r8
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    movq %r10, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r8, %rdx
-; SSE2-NEXT:    setb %dl
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r10, %r8
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; SSE2-NEXT:    movq %r11, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r8b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r11, %r10
-; SSE2-NEXT:    sbbb $0, %r8b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %r11b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    sbbb $0, %r11b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    movq %rbx, %rdx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r10, %rdx
-; SSE2-NEXT:    setb %dl
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    sbbb $0, %dl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; SSE2-NEXT:    cmpq %rax, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    movq %r14, %r10
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; SSE2-NEXT:    sbbq %rbx, %r10
-; SSE2-NEXT:    setb %r10b
-; SSE2-NEXT:    cmpq %rcx, %rax
-; SSE2-NEXT:    sbbq %r14, %rbx
-; SSE2-NEXT:    sbbb $0, %r10b
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; SSE2-NEXT:    cmpq %rax, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %rcx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %rcx
-; SSE2-NEXT:    setb %cl
-; SSE2-NEXT:    cmpq %rbx, %rax
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    sbbb $0, %cl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    cmpq %rax, %r14
-; SSE2-NEXT:    movq (%rsp), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rbx
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %rbx
-; SSE2-NEXT:    setb %bl
-; SSE2-NEXT:    cmpq %r14, %rax
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %bl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    cmpq %r9, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %r14
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    setb %bpl
-; SSE2-NEXT:    cmpq %rax, %r9
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    sbbb $0, %bpl
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; SSE2-NEXT:    cmpq %rsi, %rax
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    movq %r15, %r9
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %r9
-; SSE2-NEXT:    setb %r9b
-; SSE2-NEXT:    cmpq %rax, %rsi
-; SSE2-NEXT:    sbbq %r15, %r14
-; SSE2-NEXT:    movq %rdi, %rax
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    sbbb $0, %r9b
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; SSE2-NEXT:    cmpq %r15, %rsi
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    movq %r12, %rdi
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r14, %rdi
-; SSE2-NEXT:    setb %dil
-; SSE2-NEXT:    cmpq %rsi, %r15
-; SSE2-NEXT:    sbbq %r12, %r14
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; SSE2-NEXT:    sbbb $0, %dil
-; SSE2-NEXT:    cmpq %rsi, %r14
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; SSE2-NEXT:    movq %r13, %r15
-; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; SSE2-NEXT:    sbbq %r12, %r15
-; SSE2-NEXT:    setb %r15b
-; SSE2-NEXT:    cmpq %r14, %rsi
-; SSE2-NEXT:    sbbq %r13, %r12
-; SSE2-NEXT:    sbbb $0, %r15b
-; SSE2-NEXT:    movzbl %r15b, %esi
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    movb %sil, 4(%rax)
-; SSE2-NEXT:    movzbl %dil, %esi
-; SSE2-NEXT:    movzbl %r9b, %edi
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    andl $3, %edi
-; SSE2-NEXT:    leaq (%rdi,%rsi,4), %rsi
-; SSE2-NEXT:    movzbl %bpl, %edi
-; SSE2-NEXT:    andl $3, %edi
-; SSE2-NEXT:    shll $4, %edi
-; SSE2-NEXT:    orq %rsi, %rdi
-; SSE2-NEXT:    movzbl %bl, %r9d
-; SSE2-NEXT:    andl $3, %r9d
-; SSE2-NEXT:    shll $6, %r9d
-; SSE2-NEXT:    orq %rdi, %r9
-; SSE2-NEXT:    movzbl %cl, %esi
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $8, %esi
-; SSE2-NEXT:    orq %r9, %rsi
-; SSE2-NEXT:    movzbl %dl, %ecx
-; SSE2-NEXT:    movzbl %r10b, %edx
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $10, %edx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shll $12, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    movzbl %r11b, %edx
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $14, %edx
-; SSE2-NEXT:    orq %rcx, %rdx
-; SSE2-NEXT:    movzbl %r8b, %ecx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shll $16, %ecx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    orq %rsi, %rcx
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $18, %esi
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $20, %edx
-; SSE2-NEXT:    orq %rsi, %rdx
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shll $22, %esi
-; SSE2-NEXT:    orq %rdx, %rsi
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shll $24, %edx
-; SSE2-NEXT:    orq %rsi, %rdx
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %esi
-; SSE2-NEXT:    shlq $26, %rsi
-; SSE2-NEXT:    orq %rdx, %rsi
-; SSE2-NEXT:    orq %rcx, %rsi
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; SSE2-NEXT:    andl $3, %edx
-; SSE2-NEXT:    shlq $28, %rdx
-; SSE2-NEXT:    andl $3, %ecx
-; SSE2-NEXT:    shlq $30, %rcx
-; SSE2-NEXT:    orq %rdx, %rcx
-; SSE2-NEXT:    orq %rsi, %rcx
-; SSE2-NEXT:    movl %ecx, (%rax)
-; SSE2-NEXT:    addq $88, %rsp
-; SSE2-NEXT:    popq %rbx
-; SSE2-NEXT:    popq %r12
-; SSE2-NEXT:    popq %r13
-; SSE2-NEXT:    popq %r14
-; SSE2-NEXT:    popq %r15
-; SSE2-NEXT:    popq %rbp
-; SSE2-NEXT:    retq
-;
-; AVX2-LABEL: ucmp_uncommon_vectors:
-; AVX2:       # %bb.0:
-; AVX2-NEXT:    pushq %rbp
-; AVX2-NEXT:    pushq %r15
-; AVX2-NEXT:    pushq %r14
-; AVX2-NEXT:    pushq %r13
-; AVX2-NEXT:    pushq %r12
-; AVX2-NEXT:    pushq %rbx
-; AVX2-NEXT:    subq $88, %rsp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    andl $127, %r8d
-; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    andl $127, %edx
-; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX2-NEXT:    andl $127, %r15d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    andl $127, %eax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX2-NEXT:    andl $127, %r14d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    andl $127, %edx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX2-NEXT:    andl $127, %ebp
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    andl $127, %r8d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX2-NEXT:    andl $127, %r12d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX2-NEXT:    andl $127, %r13d
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rbx, %r11
-; AVX2-NEXT:    movq %r13, %r10
-; AVX2-NEXT:    sbbq %r12, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rbx
-; AVX2-NEXT:    sbbq %r13, %r12
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %r10, %r11
-; AVX2-NEXT:    movq %r8, %rbx
-; AVX2-NEXT:    sbbq %rbp, %rbx
-; AVX2-NEXT:    setb %bl
-; AVX2-NEXT:    cmpq %r11, %r10
-; AVX2-NEXT:    sbbq %r8, %rbp
-; AVX2-NEXT:    sbbb $0, %bl
-; AVX2-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    cmpq %r8, %r10
-; AVX2-NEXT:    movq %rdx, %r11
-; AVX2-NEXT:    sbbq %r14, %r11
-; AVX2-NEXT:    setb %r11b
-; AVX2-NEXT:    cmpq %r10, %r8
-; AVX2-NEXT:    sbbq %rdx, %r14
-; AVX2-NEXT:    sbbb $0, %r11b
-; AVX2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX2-NEXT:    cmpq %rdx, %r8
-; AVX2-NEXT:    movq %rax, %r10
-; AVX2-NEXT:    sbbq %r15, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r8, %rdx
-; AVX2-NEXT:    sbbq %rax, %r15
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r8b
-; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r8b
-; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r8b
-; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r12b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r12b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX2-NEXT:    cmpq %rax, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    movq %r11, %r8
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r10, %r8
-; AVX2-NEXT:    setb %r8b
-; AVX2-NEXT:    cmpq %rdx, %rax
-; AVX2-NEXT:    sbbq %r11, %r10
-; AVX2-NEXT:    sbbb $0, %r8b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX2-NEXT:    cmpq %rax, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    movq %rbx, %rdx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r11, %rdx
-; AVX2-NEXT:    setb %dl
-; AVX2-NEXT:    cmpq %r10, %rax
-; AVX2-NEXT:    sbbq %rbx, %r11
-; AVX2-NEXT:    sbbb $0, %dl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX2-NEXT:    cmpq %rax, %r11
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    movq %r14, %r10
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX2-NEXT:    sbbq %rbx, %r10
-; AVX2-NEXT:    setb %r10b
-; AVX2-NEXT:    cmpq %r11, %rax
-; AVX2-NEXT:    sbbq %r14, %rbx
-; AVX2-NEXT:    sbbb $0, %r10b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX2-NEXT:    cmpq %rax, %rbx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %r11
-; AVX2-NEXT:    movq (%rsp), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %r11
-; AVX2-NEXT:    setb %r11b
-; AVX2-NEXT:    cmpq %rbx, %rax
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    sbbb $0, %r11b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX2-NEXT:    cmpq %rax, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %rbx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %rbx
-; AVX2-NEXT:    setb %bl
-; AVX2-NEXT:    cmpq %r14, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
-; AVX2-NEXT:    sbbb $0, %bl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %r9, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    setb %bpl
-; AVX2-NEXT:    cmpq %rax, %r9
-; AVX2-NEXT:    sbbq %r13, %r15
-; AVX2-NEXT:    sbbb $0, %bpl
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %rsi, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %r9
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %r9
-; AVX2-NEXT:    setb %r9b
-; AVX2-NEXT:    cmpq %rax, %rsi
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    sbbb $0, %r9b
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    cmpq %rcx, %rax
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    movq %r15, %rsi
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r14, %rsi
-; AVX2-NEXT:    setb %sil
-; AVX2-NEXT:    cmpq %rax, %rcx
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    sbbb $0, %sil
-; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX2-NEXT:    cmpq %rax, %rcx
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
-; AVX2-NEXT:    movq %r13, %r14
-; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX2-NEXT:    sbbq %r15, %r14
-; AVX2-NEXT:    setb %r14b
-; AVX2-NEXT:    cmpq %rcx, %rax
-; AVX2-NEXT:    sbbq %r13, %r15
-; AVX2-NEXT:    movq %rdi, %rax
-; AVX2-NEXT:    sbbb $0, %r14b
-; AVX2-NEXT:    movzbl %r14b, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    movb %cl, 4(%rdi)
-; AVX2-NEXT:    movzbl %sil, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    movzbl %r9b, %esi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    leaq (%rsi,%rcx,4), %rcx
-; AVX2-NEXT:    movzbl %bpl, %esi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $4, %esi
-; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    movzbl %bl, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    shll $6, %ecx
-; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    movzbl %r11b, %esi
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $8, %esi
-; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    movzbl %r10b, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    shll $10, %ecx
-; AVX2-NEXT:    movzbl %dl, %edx
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shll $12, %edx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    movzbl %r8b, %edi
-; AVX2-NEXT:    andl $3, %edi
-; AVX2-NEXT:    shll $14, %edi
-; AVX2-NEXT:    orq %rdx, %rdi
-; AVX2-NEXT:    movzbl %r12b, %ecx
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    shll $16, %ecx
-; AVX2-NEXT:    orq %rdi, %rcx
-; AVX2-NEXT:    orq %rsi, %rcx
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shll $18, %edx
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $20, %esi
-; AVX2-NEXT:    orq %rdx, %rsi
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shll $22, %edx
-; AVX2-NEXT:    orq %rsi, %rdx
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shll $24, %esi
-; AVX2-NEXT:    orq %rdx, %rsi
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %edx
-; AVX2-NEXT:    shlq $26, %rdx
-; AVX2-NEXT:    orq %rsi, %rdx
-; AVX2-NEXT:    orq %rcx, %rdx
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %ecx
-; AVX2-NEXT:    shlq $28, %rcx
-; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
-; AVX2-NEXT:    andl $3, %esi
-; AVX2-NEXT:    shlq $30, %rsi
-; AVX2-NEXT:    orq %rcx, %rsi
-; AVX2-NEXT:    orq %rdx, %rsi
-; AVX2-NEXT:    movl %esi, (%rax)
-; AVX2-NEXT:    addq $88, %rsp
-; AVX2-NEXT:    popq %rbx
-; AVX2-NEXT:    popq %r12
-; AVX2-NEXT:    popq %r13
-; AVX2-NEXT:    popq %r14
-; AVX2-NEXT:    popq %r15
-; AVX2-NEXT:    popq %rbp
-; AVX2-NEXT:    retq
-;
-; AVX512-LABEL: ucmp_uncommon_vectors:
-; AVX512:       # %bb.0:
-; AVX512-NEXT:    pushq %rbp
-; AVX512-NEXT:    pushq %r15
-; AVX512-NEXT:    pushq %r14
-; AVX512-NEXT:    pushq %r13
-; AVX512-NEXT:    pushq %r12
-; AVX512-NEXT:    pushq %rbx
-; AVX512-NEXT:    subq $88, %rsp
-; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    andl $127, %r8d
-; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    andl $127, %edx
-; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; AVX512-NEXT:    andl $127, %ebp
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; AVX512-NEXT:    andl $127, %r12d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; AVX512-NEXT:    andl $127, %r13d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; AVX512-NEXT:    andl $127, %r15d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    andl $127, %r10d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; AVX512-NEXT:    andl $127, %ebx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    andl $127, %r8d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; AVX512-NEXT:    andl $127, %r9d
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    andl $127, %esi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    andl $127, %edi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    andl $127, %eax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    andl $127, %edx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    movq %rdx, %rcx
-; AVX512-NEXT:    sbbq %rax, %rcx
-; AVX512-NEXT:    setb %cl
-; AVX512-NEXT:    cmpq %r11, %r14
-; AVX512-NEXT:    sbbq %rdx, %rax
-; AVX512-NEXT:    sbbb $0, %cl
-; AVX512-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r9, %rdx
-; AVX512-NEXT:    sbbq %r8, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r9, %r8
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %rbx, %rdx
-; AVX512-NEXT:    sbbq %r10, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rbx, %r10
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r15, %rdx
-; AVX512-NEXT:    sbbq %r13, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r15, %r13
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq %r12, %rdx
-; AVX512-NEXT:    sbbq %rbp, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %r12, %rbp
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %r13b
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r13b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    cmpq %rax, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rdx
-; AVX512-NEXT:    setb %bpl
-; AVX512-NEXT:    cmpq %rcx, %rax
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %bpl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    cmpq %rcx, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rax
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; AVX512-NEXT:    sbbq %rsi, %rax
-; AVX512-NEXT:    setb %r9b
-; AVX512-NEXT:    cmpq %rdx, %rcx
-; AVX512-NEXT:    sbbq %rdi, %rsi
-; AVX512-NEXT:    sbbb $0, %r9b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    cmpq %rdx, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; AVX512-NEXT:    movq %rdi, %rcx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rcx
-; AVX512-NEXT:    setb %cl
-; AVX512-NEXT:    cmpq %rsi, %rdx
-; AVX512-NEXT:    sbbq %rdi, %rax
-; AVX512-NEXT:    sbbb $0, %cl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    cmpq %rsi, %rdi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
-; AVX512-NEXT:    movq %r8, %rdx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rdx
-; AVX512-NEXT:    setb %dl
-; AVX512-NEXT:    cmpq %rdi, %rsi
-; AVX512-NEXT:    sbbq %r8, %rax
-; AVX512-NEXT:    sbbb $0, %dl
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    cmpq %rdi, %r8
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
-; AVX512-NEXT:    movq %r10, %rsi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rsi
-; AVX512-NEXT:    setb %sil
-; AVX512-NEXT:    cmpq %r8, %rdi
-; AVX512-NEXT:    sbbq %r10, %rax
-; AVX512-NEXT:    sbbb $0, %sil
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    cmpq %r8, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
-; AVX512-NEXT:    movq %r11, %rdi
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %rdi
-; AVX512-NEXT:    setb %dil
-; AVX512-NEXT:    cmpq %r10, %r8
-; AVX512-NEXT:    sbbq %r11, %rax
-; AVX512-NEXT:    sbbb $0, %dil
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    cmpq %rax, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    movq %rbx, %r8
-; AVX512-NEXT:    movq (%rsp), %r11 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r11, %r8
-; AVX512-NEXT:    setb %r8b
-; AVX512-NEXT:    cmpq %r10, %rax
-; AVX512-NEXT:    sbbq %rbx, %r11
-; AVX512-NEXT:    sbbb $0, %r8b
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
-; AVX512-NEXT:    cmpq %rbx, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movq %r14, %r10
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    sbbq %rax, %r10
-; AVX512-NEXT:    setb %r10b
-; AVX512-NEXT:    cmpq %r11, %rbx
-; AVX512-NEXT:    sbbq %r14, %rax
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbb $0, %r10b
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
-; AVX512-NEXT:    cmpq %r15, %r11
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %rbx
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r14, %rbx
-; AVX512-NEXT:    setb %bl
-; AVX512-NEXT:    cmpq %r11, %r15
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; AVX512-NEXT:    sbbq %rax, %r14
-; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; AVX512-NEXT:    sbbb $0, %bl
-; AVX512-NEXT:    cmpq %r11, %r14
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; AVX512-NEXT:    movq %rax, %r15
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
-; AVX512-NEXT:    sbbq %r12, %r15
-; AVX512-NEXT:    setb %r15b
-; AVX512-NEXT:    cmpq %r14, %r11
-; AVX512-NEXT:    sbbq %rax, %r12
-; AVX512-NEXT:    sbbb $0, %r15b
-; AVX512-NEXT:    movzbl %r15b, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
-; AVX512-NEXT:    movb %r11b, 4(%r14)
-; AVX512-NEXT:    movzbl %bl, %r11d
-; AVX512-NEXT:    andl $3, %r11d
-; AVX512-NEXT:    movzbl %r10b, %r10d
-; AVX512-NEXT:    andl $3, %r10d
-; AVX512-NEXT:    leaq (%r10,%r11,4), %r10
-; AVX512-NEXT:    movzbl %r8b, %r8d
-; AVX512-NEXT:    andl $3, %r8d
-; AVX512-NEXT:    shll $4, %r8d
-; AVX512-NEXT:    orq %r10, %r8
-; AVX512-NEXT:    movzbl %dil, %edi
-; AVX512-NEXT:    andl $3, %edi
-; AVX512-NEXT:    shll $6, %edi
-; AVX512-NEXT:    orq %r8, %rdi
-; AVX512-NEXT:    movzbl %sil, %esi
-; AVX512-NEXT:    andl $3, %esi
-; AVX512-NEXT:    shll $8, %esi
-; AVX512-NEXT:    orq %rdi, %rsi
-; AVX512-NEXT:    movzbl %dl, %edx
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $10, %edx
-; AVX512-NEXT:    movzbl %cl, %ecx
-; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shll $12, %ecx
-; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    movzbl %r9b, %edx
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $14, %edx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl %bpl, %eax
-; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    shll $16, %eax
-; AVX512-NEXT:    orq %rdx, %rax
-; AVX512-NEXT:    orq %rsi, %rax
-; AVX512-NEXT:    movzbl %r13b, %ecx
-; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shll $18, %ecx
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $20, %edx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shll $22, %ecx
-; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shll $24, %edx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %ecx # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %ecx
-; AVX512-NEXT:    shlq $26, %rcx
-; AVX512-NEXT:    orq %rdx, %rcx
-; AVX512-NEXT:    orq %rax, %rcx
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %eax
-; AVX512-NEXT:    shlq $28, %rax
-; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
-; AVX512-NEXT:    andl $3, %edx
-; AVX512-NEXT:    shlq $30, %rdx
-; AVX512-NEXT:    orq %rax, %rdx
-; AVX512-NEXT:    orq %rcx, %rdx
-; AVX512-NEXT:    movq %r14, %rax
-; AVX512-NEXT:    movl %edx, (%r14)
-; AVX512-NEXT:    addq $88, %rsp
-; AVX512-NEXT:    popq %rbx
-; AVX512-NEXT:    popq %r12
-; AVX512-NEXT:    popq %r13
-; AVX512-NEXT:    popq %r14
-; AVX512-NEXT:    popq %r15
-; AVX512-NEXT:    popq %rbp
-; AVX512-NEXT:    retq
+; X64-LABEL: ucmp_uncommon_vectors:
+; X64:       # %bb.0:
+; X64-NEXT:    pushq %rbp
+; X64-NEXT:    pushq %r15
+; X64-NEXT:    pushq %r14
+; X64-NEXT:    pushq %r13
+; X64-NEXT:    pushq %r12
+; X64-NEXT:    pushq %rbx
+; X64-NEXT:    subq $120, %rsp
+; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    andl $127, %edx
+; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    andl $127, %r8d
+; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    andl $127, %r12d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    andl $127, %r14d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    andl $127, %ebx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; X64-NEXT:    andl $127, %r15d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; X64-NEXT:    andl $127, %ebp
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; X64-NEXT:    andl $127, %r11d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; X64-NEXT:    andl $127, %r13d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    andl $127, %r10d
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    andl $127, %esi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    andl $127, %edi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    andl $127, %eax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    andl $127, %edx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    cmpq %r9, %r8
+; X64-NEXT:    movq %rdx, %rcx
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    setb %cl
+; X64-NEXT:    cmpq %r8, %r9
+; X64-NEXT:    sbbq %rdx, %rax
+; X64-NEXT:    sbbb $0, %cl
+; X64-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    movq %rdi, %rdx
+; X64-NEXT:    sbbq %rsi, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %rdi, %rsi
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    movq %r10, %rdx
+; X64-NEXT:    sbbq %r13, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %r10, %r13
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    movq %r11, %rdx
+; X64-NEXT:    sbbq %rbp, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %r11, %rbp
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    movq %r15, %rdx
+; X64-NEXT:    sbbq %rbx, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %r15, %rbx
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    cmpq %rax, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, %rdx
+; X64-NEXT:    sbbq %r14, %rdx
+; X64-NEXT:    setb %dl
+; X64-NEXT:    cmpq %rcx, %rax
+; X64-NEXT:    sbbq %rsi, %r14
+; X64-NEXT:    sbbb $0, %dl
+; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    cmpq %rcx, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    sbbq %r12, %rax
+; X64-NEXT:    setb %r13b
+; X64-NEXT:    cmpq %rdx, %rcx
+; X64-NEXT:    sbbq %rsi, %r12
+; X64-NEXT:    sbbb $0, %r13b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    cmpq %rdx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movq %rdi, %rcx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rcx
+; X64-NEXT:    setb %bpl
+; X64-NEXT:    cmpq %rsi, %rdx
+; X64-NEXT:    sbbq %rdi, %rax
+; X64-NEXT:    sbbb $0, %bpl
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    cmpq %rsi, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rdx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rdx
+; X64-NEXT:    setb %r11b
+; X64-NEXT:    cmpq %rdi, %rsi
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %r11b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    cmpq %rdi, %r8
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rsi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rsi
+; X64-NEXT:    setb %sil
+; X64-NEXT:    cmpq %r8, %rdi
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %sil
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    cmpq %r8, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %rdi
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rdi
+; X64-NEXT:    setb %dil
+; X64-NEXT:    cmpq %r9, %r8
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %dil
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    cmpq %r9, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %r8
+; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %r8
+; X64-NEXT:    setb %r8b
+; X64-NEXT:    cmpq %r10, %r9
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %r8b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    cmpq %r10, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    movq %rcx, %r9
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %r9
+; X64-NEXT:    setb %r9b
+; X64-NEXT:    cmpq %rbx, %r10
+; X64-NEXT:    sbbq %rcx, %rax
+; X64-NEXT:    sbbb $0, %r9b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    cmpq %rax, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %r10
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rcx, %r10
+; X64-NEXT:    setb %r10b
+; X64-NEXT:    cmpq %rbx, %rax
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    sbbb $0, %r10b
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    cmpq %rcx, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %rbx
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %rbx
+; X64-NEXT:    setb %bl
+; X64-NEXT:    cmpq %r14, %rcx
+; X64-NEXT:    sbbq %rdx, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; X64-NEXT:    sbbb $0, %bl
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    cmpq %rcx, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %r15
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; X64-NEXT:    sbbq %rax, %r15
+; X64-NEXT:    setb %r15b
+; X64-NEXT:    cmpq %r14, %rcx
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; X64-NEXT:    sbbq %rdx, %rax
+; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; X64-NEXT:    sbbb $0, %r15b
+; X64-NEXT:    cmpq %r12, %rax
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
+; X64-NEXT:    movq %rdx, %r14
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
+; X64-NEXT:    sbbq %rcx, %r14
+; X64-NEXT:    setb %r14b
+; X64-NEXT:    cmpq %rax, %r12
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm0
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm1
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm2
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm3
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm4
+; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; X64-NEXT:    movd %eax, %xmm5
+; X64-NEXT:    movzbl %r13b, %eax
+; X64-NEXT:    movd %eax, %xmm6
+; X64-NEXT:    movzbl %bpl, %eax
+; X64-NEXT:    movd %eax, %xmm7
+; X64-NEXT:    movzbl %r11b, %eax
+; X64-NEXT:    movd %eax, %xmm8
+; X64-NEXT:    movzbl %sil, %eax
+; X64-NEXT:    movd %eax, %xmm9
+; X64-NEXT:    movzbl %dil, %eax
+; X64-NEXT:    movd %eax, %xmm10
+; X64-NEXT:    movzbl %r8b, %eax
+; X64-NEXT:    movd %eax, %xmm11
+; X64-NEXT:    movzbl %r9b, %eax
+; X64-NEXT:    movd %eax, %xmm12
+; X64-NEXT:    movzbl %r10b, %eax
+; X64-NEXT:    movd %eax, %xmm13
+; X64-NEXT:    movzbl %bl, %eax
+; X64-NEXT:    movd %eax, %xmm14
+; X64-NEXT:    movzbl %r15b, %eax
+; X64-NEXT:    movd %eax, %xmm15
+; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; X64-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; X64-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; X64-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
+; X64-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0]
+; X64-NEXT:    sbbq %rdx, %rcx
+; X64-NEXT:    sbbb $0, %r14b
+; X64-NEXT:    andb $3, %r14b
+; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; X64-NEXT:    movb %r14b, 4(%rdi)
+; X64-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    leaq (%rcx,%rax,4), %rax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    shll $4, %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    shll $6, %eax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    shll $8, %ecx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    shll $10, %eax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    shll $12, %edx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; X64-NEXT:    andl $3, %esi
+; X64-NEXT:    shll $14, %esi
+; X64-NEXT:    orq %rdx, %rsi
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    shll $16, %eax
+; X64-NEXT:    orq %rsi, %rax
+; X64-NEXT:    orq %rcx, %rax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    shll $18, %ecx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    shll $20, %edx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    shll $22, %ecx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    shll $24, %edx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; X64-NEXT:    andl $3, %ecx
+; X64-NEXT:    shlq $26, %rcx
+; X64-NEXT:    orq %rdx, %rcx
+; X64-NEXT:    orq %rax, %rcx
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
+; X64-NEXT:    andl $3, %eax
+; X64-NEXT:    shlq $28, %rax
+; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; X64-NEXT:    andl $3, %edx
+; X64-NEXT:    shlq $30, %rdx
+; X64-NEXT:    orq %rax, %rdx
+; X64-NEXT:    orq %rcx, %rdx
+; X64-NEXT:    movq %rdi, %rax
+; X64-NEXT:    movl %edx, (%rdi)
+; X64-NEXT:    addq $120, %rsp
+; X64-NEXT:    popq %rbx
+; X64-NEXT:    popq %r12
+; X64-NEXT:    popq %r13
+; X64-NEXT:    popq %r14
+; X64-NEXT:    popq %r15
+; X64-NEXT:    popq %rbp
+; X64-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_uncommon_vectors:
 ; X86:       # %bb.0:
@@ -3198,22 +2120,22 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %bl
 ; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl %esi, %edi
 ; X86-NEXT:    sbbl %edx, %edi
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %edi
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NEXT:    sbbl %ebx, %edi
 ; X86-NEXT:    movl $0, %edi
 ; X86-NEXT:    sbbl %edi, %edi
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl %ebp, {{[0-9]+}}(%esp)
 ; X86-NEXT:    sbbl %esi, %edx
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    sbbl %eax, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
@@ -3239,208 +2161,213 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edx
+; X86-NEXT:    cmpl %eax, %edx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl %esi, %edi
-; X86-NEXT:    sbbl %edx, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl %edi, %ecx
+; X86-NEXT:    sbbl %esi, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %edi
+; X86-NEXT:    movl %ebp, %ecx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edi
-; X86-NEXT:    movl $0, %edi
-; X86-NEXT:    sbbl %edi, %edi
+; X86-NEXT:    sbbl %ebx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
 ; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    cmpl %edx, %eax
+; X86-NEXT:    sbbl %edi, %esi
 ; X86-NEXT:    sbbl %ebp, %ebx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %cl
 ; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %edx
-; X86-NEXT:    sbbl %esi, %edx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl %ebx, %edx
+; X86-NEXT:    sbbl %edi, %edx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; X86-NEXT:    movl %ebp, %edx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %edx
 ; X86-NEXT:    movl $0, %edx
 ; X86-NEXT:    sbbl %edx, %edx
 ; X86-NEXT:    setb %dl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %eax
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %ebp, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %dl
 ; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl %ebp, %eax
+; X86-NEXT:    sbbl %ebx, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    cmpl %eax, %ecx
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    sbbl %ebp, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %esi
+; X86-NEXT:    sbbl %esi, %esi
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebx
-; X86-NEXT:    sbbl %esi, %ebx
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    cmpl %esi, %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    sbbl %ebx, %ebp
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebx
-; X86-NEXT:    movl $0, %ebx
-; X86-NEXT:    sbbl %ebx, %ebx
-; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %ecx, %eax
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebp, %edx
+; X86-NEXT:    movl %edx, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebp
+; X86-NEXT:    movl $0, %ebp
+; X86-NEXT:    sbbl %ebp, %ebp
+; X86-NEXT:    setb {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
+; X86-NEXT:    cmpl %edi, %esi
+; X86-NEXT:    sbbl %eax, %ebx
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movb %bl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    sbbb $0, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Folded Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %edi, %ebp
-; X86-NEXT:    sbbl %esi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
 ; X86-NEXT:    movl %ebx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebp
+; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebp
 ; X86-NEXT:    movl $0, %ebp
 ; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %edi, %esi
-; X86-NEXT:    sbbl %ebx, %edx
+; X86-NEXT:    setb %dl
+; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %ebx, %edi
+; X86-NEXT:    sbbl %eax, %ecx
 ; X86-NEXT:    movl $0, %eax
 ; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    sbbb $0, %dl
+; X86-NEXT:    movb %dl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    cmpl %eax, %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    movl %ebp, %eax
-; X86-NEXT:    sbbl %edi, %eax
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NEXT:    movl %ecx, %eax
+; X86-NEXT:    movl %ebp, %ebx
+; X86-NEXT:    sbbl %edi, %ebx
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %eax
-; X86-NEXT:    movl $0, %eax
-; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    movl %edx, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %ebx
+; X86-NEXT:    movl $0, %ebx
+; X86-NEXT:    sbbl %ebx, %ebx
 ; X86-NEXT:    setb %bl
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
+; X86-NEXT:    cmpl %esi, %eax
 ; X86-NEXT:    sbbl %ebp, %edi
-; X86-NEXT:    sbbl %ecx, %edx
-; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
 ; X86-NEXT:    sbbb $0, %bl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %esi
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NEXT:    movl %ecx, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %edi
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NEXT:    movl %esi, %eax
+; X86-NEXT:    sbbl %ebp, %eax
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    sbbl %edx, %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %bh
-; X86-NEXT:    cmpl %esi, {{[0-9]+}}(%esp)
-; X86-NEXT:    sbbl %ecx, %edi
-; X86-NEXT:    sbbl %eax, %edx
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NEXT:    sbbl %ecx, %eax
+; X86-NEXT:    movl $0, %eax
+; X86-NEXT:    sbbl %eax, %eax
+; X86-NEXT:    setb %al
+; X86-NEXT:    cmpl %edi, {{[0-9]+}}(%esp)
+; X86-NEXT:    sbbl %esi, %ebp
+; X86-NEXT:    sbbl %edx, %ecx
 ; X86-NEXT:    movl $0, %ecx
 ; X86-NEXT:    sbbl %ecx, %ecx
-; X86-NEXT:    sbbb $0, %bh
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movzbl %al, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl %bl, %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    cmpl %eax, {{[0-9]+}}(%esp)
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movl %esi, %ebp
-; X86-NEXT:    sbbl %edi, %ebp
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NEXT:    sbbl %eax, %ecx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NEXT:    movl %edx, %ebp
-; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %ebp
-; X86-NEXT:    movl $0, %ebp
-; X86-NEXT:    sbbl %ebp, %ebp
-; X86-NEXT:    setb %cl
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebp
-; X86-NEXT:    sbbl %esi, %edi
-; X86-NEXT:    sbbl %edx, %eax
+; X86-NEXT:    sbbl %edx, %ecx
+; X86-NEXT:    movl $0, %ecx
+; X86-NEXT:    sbbl %ecx, %ecx
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    setb %al
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    cmpl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NEXT:    sbbl {{[0-9]+}}(%esp), %ebx
+; X86-NEXT:    sbbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    sbbl %edx, %edx
+; X86-NEXT:    sbbb $0, %al
+; X86-NEXT:    movb %al, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NEXT:    sbbl %eax, %eax
-; X86-NEXT:    sbbb $0, %cl
-; X86-NEXT:    movzbl %cl, %ecx
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NEXT:    movb %cl, 4(%edi)
-; X86-NEXT:    movzbl %bh, %ebp
-; X86-NEXT:    movzbl %bl, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ebp
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    leal (%ecx,%ebp,4), %ecx
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $4, %eax
-; X86-NEXT:    orl %ecx, %eax
-; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $6, %ebx
-; X86-NEXT:    orl %eax, %ebx
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NEXT:    andl $3, %edx
+; X86-NEXT:    leal (%edx,%eax,4), %edx
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $4, %edi
+; X86-NEXT:    orl %edx, %edi
+; X86-NEXT:    andl $3, %ebp
+; X86-NEXT:    shll $6, %ebp
+; X86-NEXT:    orl %edi, %ebp
 ; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    shll $8, %esi
-; X86-NEXT:    orl %ebx, %esi
+; X86-NEXT:    orl %ebp, %esi
+; X86-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NEXT:    andl $3, %edx
 ; X86-NEXT:    shll $10, %edx
 ; X86-NEXT:    orl %esi, %edx
+; X86-NEXT:    andl $3, %ecx
+; X86-NEXT:    shll $12, %ecx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $12, %eax
+; X86-NEXT:    shll $14, %eax
+; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $14, %ecx
+; X86-NEXT:    shll $16, %ecx
 ; X86-NEXT:    orl %eax, %ecx
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %eax
-; X86-NEXT:    shll $16, %eax
-; X86-NEXT:    orl %ecx, %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %esi
 ; X86-NEXT:    shll $18, %esi
-; X86-NEXT:    orl %eax, %esi
-; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 1-byte Folded Reload
+; X86-NEXT:    orl %ecx, %esi
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
+; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 1-byte Folded Reload
 ; X86-NEXT:    andl $3, %eax
 ; X86-NEXT:    shll $20, %eax
 ; X86-NEXT:    orl %esi, %eax
@@ -3449,22 +2376,24 @@ define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
 ; X86-NEXT:    orl %edx, %eax
 ; X86-NEXT:    andl $3, %ecx
 ; X86-NEXT:    shll $22, %ecx
+; X86-NEXT:    andl $3, %edi
+; X86-NEXT:    shll $24, %edi
+; X86-NEXT:    orl %ecx, %edi
 ; X86-NEXT:    andl $3, %esi
-; X86-NEXT:    shll $24, %esi
-; X86-NEXT:    orl %ecx, %esi
+; X86-NEXT:    shll $26, %esi
+; X86-NEXT:    orl %edi, %esi
 ; X86-NEXT:    andl $3, %ebx
-; X86-NEXT:    shll $26, %ebx
+; X86-NEXT:    shll $28, %ebx
 ; X86-NEXT:    orl %esi, %ebx
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 1-byte Folded Reload
-; X86-NEXT:    andl $3, %ecx
-; X86-NEXT:    shll $28, %ecx
+; X86-NEXT:    shll $30, %ecx
 ; X86-NEXT:    orl %ebx, %ecx
+; X86-NEXT:    orl %eax, %ecx
+; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    movzbl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 1-byte Folded Reload
-; X86-NEXT:    shll $30, %edx
-; X86-NEXT:    orl %ecx, %edx
-; X86-NEXT:    orl %eax, %edx
-; X86-NEXT:    movl %edx, (%edi)
-; X86-NEXT:    movl %edi, %eax
+; X86-NEXT:    andb $3, %dl
+; X86-NEXT:    movb %dl, 4(%eax)
+; X86-NEXT:    movl %ecx, (%eax)
 ; X86-NEXT:    addl $132, %esp
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    popl %edi

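For readers tracking the scalar tail of ucmp_uncommon_vectors above: each i2 lane is masked to two bits (andl $3), shifted to bit 2*i, and OR-merged, with lane 16 spilled to a fifth output byte (movb ..., 4(%rdi)). A minimal standalone sketch of that packing, with an illustrative helper name that is not part of the patch:

#include <cassert>
#include <cstdint>

// Packs 17 two-bit lanes the way the generated scalar code does: mask each
// lane with `and $3`, shift it to bit 2*i, OR-merge the first 16 lanes into
// a 32-bit word, and store lane 16 separately at byte offset 4.
static void packV17I2(const uint8_t Lanes[17], uint8_t Out[5]) {
  uint32_t Lo = 0;
  for (int I = 0; I < 16; ++I)
    Lo |= static_cast<uint32_t>(Lanes[I] & 3) << (2 * I);
  Out[0] = Lo;
  Out[1] = Lo >> 8;
  Out[2] = Lo >> 16;
  Out[3] = Lo >> 24;
  Out[4] = Lanes[16] & 3; // the extra byte at offset 4
}

int main() {
  uint8_t Lanes[17] = {1, 2}; // lane 0 = 1, lane 1 = 2, rest 0
  uint8_t Out[5];
  packV17I2(Lanes, Out);
  assert(Out[0] == 0x09 && Out[4] == 0); // 1 | (2 << 2)
}
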
>From f95943e9c8a7030ca44812d7c2971e6afdcfb52c Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Fri, 13 Sep 2024 18:29:16 -0700
Subject: [PATCH 39/56] Accept test changes with a higher instruction count

---
 llvm/test/CodeGen/X86/bitselect.ll | 30 ++++++++++++++++++++----------
 llvm/test/CodeGen/X86/bool-math.ll | 15 ++++++++-------
 llvm/test/CodeGen/X86/subcarry.ll  | 19 +++++++++++--------
 llvm/test/CodeGen/X86/xor-icmp.ll  | 15 ++++++++++-----
 4 files changed, 49 insertions(+), 30 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bitselect.ll b/llvm/test/CodeGen/X86/bitselect.ll
index 2922113b14ea90..347cdf39524479 100644
--- a/llvm/test/CodeGen/X86/bitselect.ll
+++ b/llvm/test/CodeGen/X86/bitselect.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc < %s -mtriple=i686-unknown-unknown | FileCheck %s --check-prefixes=X86
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64,X64-NOBMI
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64,X64-BMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=-bmi | FileCheck %s --check-prefixes=X64-NOBMI
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mattr=+bmi | FileCheck %s --check-prefixes=X64-BMI
 
 ; PR46472
 ; bitselect(a,b,m) == or(and(a,not(m)),and(b,m))
@@ -17,14 +17,24 @@ define i8 @bitselect_i8(i8 %a, i8 %b, i8 %m) nounwind {
 ; X86-NEXT:    xorb %cl, %al
 ; X86-NEXT:    retl
 ;
-; X64-LABEL: bitselect_i8:
-; X64:       # %bb.0:
-; X64-NEXT:    andl %edx, %esi
-; X64-NEXT:    movl %edx, %eax
-; X64-NEXT:    notb %al
-; X64-NEXT:    andb %dil, %al
-; X64-NEXT:    orb %sil, %al
-; X64-NEXT:    retq
+; X64-NOBMI-LABEL: bitselect_i8:
+; X64-NOBMI:       # %bb.0:
+; X64-NOBMI-NEXT:    movl %esi, %eax
+; X64-NOBMI-NEXT:    xorb %dil, %al
+; X64-NOBMI-NEXT:    andb %dl, %al
+; X64-NOBMI-NEXT:    xorb %dil, %al
+; X64-NOBMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-NOBMI-NEXT:    retq
+;
+; X64-BMI-LABEL: bitselect_i8:
+; X64-BMI:       # %bb.0:
+; X64-BMI-NEXT:    movl %edx, %eax
+; X64-BMI-NEXT:    andb %al, %sil
+; X64-BMI-NEXT:    notb %al
+; X64-BMI-NEXT:    andb %dil, %al
+; X64-BMI-NEXT:    orb %sil, %al
+; X64-BMI-NEXT:    # kill: def $al killed $al killed $eax
+; X64-BMI-NEXT:    retq
   %not = xor i8 %m, -1
   %ma = and i8 %a, %not
   %mb = and i8 %b, %m
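
The X64-NOBMI sequence above is the standard xor-masking form of the identity in the comment at the top of this file: or(and(a,not(m)),and(b,m)) == xor(and(xor(a,b),m),a). A small standalone check, exhaustive at i8 width (names are illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

// Reference form from the test comment: or(and(a,not(m)),and(b,m)).
static uint8_t bitselectRef(uint8_t A, uint8_t B, uint8_t M) {
  return (A & ~M) | (B & M);
}

// Xor-masking form emitted by the X64-NOBMI sequence: xorb/andb/xorb.
static uint8_t bitselectXor(uint8_t A, uint8_t B, uint8_t M) {
  return static_cast<uint8_t>(((A ^ B) & M) ^ A);
}

int main() {
  for (unsigned A = 0; A < 256; ++A)
    for (unsigned B = 0; B < 256; ++B)
      for (unsigned M = 0; M < 256; ++M)
        assert(bitselectRef(A, B, M) == bitselectXor(A, B, M));
}
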
diff --git a/llvm/test/CodeGen/X86/bool-math.ll b/llvm/test/CodeGen/X86/bool-math.ll
index b73af677bc6cdd..dd8511f3a81524 100644
--- a/llvm/test/CodeGen/X86/bool-math.ll
+++ b/llvm/test/CodeGen/X86/bool-math.ll
@@ -262,19 +262,20 @@ define i8 @low_bit_select_constants_bigger_true_narrower_result(i16 %x) {
 define i1 @opaque_constant(i48 %x, i48 %y) {
 ; X64-LABEL: opaque_constant:
 ; X64:       # %bb.0:
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    xorq %rsi, %rax
+; X64-NEXT:    movq %rsi, %rax
+; X64-NEXT:    shrq $32, %rdi
 ; X64-NEXT:    shrq $32, %rax
-; X64-NEXT:    andl $1, %eax
+; X64-NEXT:    xorb %dil, %al
+; X64-NEXT:    andb $1, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $rax
 ; X64-NEXT:    retq
 ;
 ; X32-LABEL: opaque_constant:
 ; X32:       # %bb.0:
-; X32-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    xorl {{[0-9]+}}(%esp), %eax
-; X32-NEXT:    andl $1, %eax
-; X32-NEXT:    # kill: def $al killed $al killed $eax
+; X32-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X32-NEXT:    movl $1, %ecx
+; X32-NEXT:    xorb {{[0-9]+}}(%esp), %al
+; X32-NEXT:    andb %cl, %al
 ; X32-NEXT:    retl
   %andx = and i48 %x, 4294967296
   %andy = and i48 %y, 4294967296
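
Both versions of opaque_constant reduce to comparing bit 32 of the two i48 values: the old code xors first and then extracts the bit, while the new code extracts each bit (shrq $32) and xors the low bytes. A quick standalone check over a few sample values (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t Samples[] = {0, 1ULL << 32, 0xFFFFFFFFFFFFULL, 0xFFFFFFFFULL};
  for (uint64_t X : Samples)
    for (uint64_t Y : Samples) {
      bool OldForm = ((X ^ Y) >> 32) & 1;                // xor, then extract bit 32
      bool NewForm = (((X >> 32) ^ (Y >> 32)) & 1) != 0; // shrq $32 twice, then xorb
      assert(OldForm == NewForm);
    }
}
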
diff --git a/llvm/test/CodeGen/X86/subcarry.ll b/llvm/test/CodeGen/X86/subcarry.ll
index 1e9db9f55a8d5c..1570a3b41b2053 100644
--- a/llvm/test/CodeGen/X86/subcarry.ll
+++ b/llvm/test/CodeGen/X86/subcarry.ll
@@ -46,13 +46,16 @@ define %S @negate(ptr nocapture readonly %this) {
 ; CHECK-NEXT:    subq (%rsi), %rdx
 ; CHECK-NEXT:    movl $0, %edi
 ; CHECK-NEXT:    sbbq 8(%rsi), %rdi
-; CHECK-NEXT:    movl $0, %r8d
-; CHECK-NEXT:    sbbq 16(%rsi), %r8
-; CHECK-NEXT:    sbbq 24(%rsi), %rcx
+; CHECK-NEXT:    sbbq 16(%rsi), %rcx
+; CHECK-NEXT:    setae %r8b
+; CHECK-NEXT:    movzbl %r8b, %r8d
+; CHECK-NEXT:    movq 24(%rsi), %rsi
+; CHECK-NEXT:    notq %rsi
+; CHECK-NEXT:    addq %r8, %rsi
 ; CHECK-NEXT:    movq %rdx, (%rax)
 ; CHECK-NEXT:    movq %rdi, 8(%rax)
-; CHECK-NEXT:    movq %r8, 16(%rax)
-; CHECK-NEXT:    movq %rcx, 24(%rax)
+; CHECK-NEXT:    movq %rcx, 16(%rax)
+; CHECK-NEXT:    movq %rsi, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = load i64, ptr %this, align 8
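
The rewrite of the last limb in negate() above is the borrow identity 0 - y - b == ~y + (1 - b) (mod 2^64): setae captures 1 - b, notq computes ~y, and addq combines them. The algebra is width-independent; a standalone exhaustive check at 8 bits (illustrative only, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  for (unsigned Y = 0; Y < 256; ++Y)
    for (unsigned B = 0; B < 2; ++B) {
      uint8_t Sbb = static_cast<uint8_t>(0 - Y - B);       // sbb-style: 0 - y - borrow
      uint8_t NotAdd = static_cast<uint8_t>(~Y + (1 - B)); // notq + setae + addq
      assert(Sbb == NotAdd);
    }
}
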
@@ -108,13 +111,13 @@ define %S @sub(ptr nocapture readonly %this, %S %arg.b) {
 ; CHECK-NEXT:    movzbl %r10b, %r10d
 ; CHECK-NEXT:    notq %r8
 ; CHECK-NEXT:    addq %rdx, %r8
-; CHECK-NEXT:    adcq 24(%rsi), %r10
 ; CHECK-NEXT:    notq %r9
-; CHECK-NEXT:    addq %r10, %r9
+; CHECK-NEXT:    adcq 24(%rsi), %r10
+; CHECK-NEXT:    addq %r9, %r10
 ; CHECK-NEXT:    movq %rdi, (%rax)
 ; CHECK-NEXT:    movq %rcx, 8(%rax)
 ; CHECK-NEXT:    movq %r8, 16(%rax)
-; CHECK-NEXT:    movq %r9, 24(%rax)
+; CHECK-NEXT:    movq %r10, 24(%rax)
 ; CHECK-NEXT:    retq
 entry:
   %0 = extractvalue %S %arg.b, 0
diff --git a/llvm/test/CodeGen/X86/xor-icmp.ll b/llvm/test/CodeGen/X86/xor-icmp.ll
index 16a3b6cb855a7d..59db27ef5e6dd1 100644
--- a/llvm/test/CodeGen/X86/xor-icmp.ll
+++ b/llvm/test/CodeGen/X86/xor-icmp.ll
@@ -15,11 +15,16 @@ define i32 @t(i32 %a, i32 %b) nounwind ssp {
 ;
 ; X64-LABEL: t:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    xorl %esi, %edi
+; X64-NEXT:    shrl $14, %edi
+; X64-NEXT:    andb $1, %dil
+; X64-NEXT:    btl $14, %esi
+; X64-NEXT:    sbbb $0, %dil
+; X64-NEXT:    je .LBB0_1
+; X64-NEXT:  # %bb.2: # %bb1
+; X64-NEXT:    xorl %eax, %eax
+; X64-NEXT:    jmp bar # TAILCALL
+; X64-NEXT:  .LBB0_1: # %bb
 ; X64-NEXT:    xorl %eax, %eax
-; X64-NEXT:    testl $16384, %edi # imm = 0x4000
-; X64-NEXT:    jne bar # TAILCALL
-; X64-NEXT:  # %bb.1: # %bb
 ; X64-NEXT:    jmp foo # TAILCALL
 entry:
   %0 = and i32 %a, 16384
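
The new sequence for t() tests whether bit 14 differs between the two arguments without materializing a ^ b: shr/and extract bit 14 of the first argument, btl/sbbb subtract bit 14 of the second, and the branch fires on a nonzero difference, matching the old testl $16384 on the xor. A standalone check over sample values (names illustrative, not part of the patch):

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t Samples[] = {0, 0x4000, 0xBFFF, 0xFFFFFFFF, 0x12345678};
  for (uint32_t A : Samples)
    for (uint32_t B : Samples) {
      uint8_t Dil = (A >> 14) & 1;             // shrl $14 + andb $1
      uint8_t CF = (B >> 14) & 1;              // btl $14 sets CF to bit 14
      Dil = static_cast<uint8_t>(Dil - CF);    // sbbb $0, %dil
      bool OldForm = ((A ^ B) & 0x4000u) != 0; // testl $16384, (a ^ b)
      assert((Dil != 0) == OldForm);
    }
}
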
@@ -96,7 +101,7 @@ define i1 @xor_not_bools(i1 zeroext %x, i1 zeroext %y) nounwind {
 ; X64-LABEL: xor_not_bools:
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    xorl %esi, %eax
+; X64-NEXT:    xorb %sil, %al
 ; X64-NEXT:    xorb $1, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq

>From 688c98378d605219c5506e4ba0296a4b9abc305f Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:10:18 -0700
Subject: [PATCH 40/56] Regenerate pr32329.ll checks (non-trivial diff)

---
 llvm/test/CodeGen/X86/pr32329.ll | 95 ++++++++++++++++++--------------
 1 file changed, 54 insertions(+), 41 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr32329.ll b/llvm/test/CodeGen/X86/pr32329.ll
index d9671aa04f4603..4a4082a2a59ed4 100644
--- a/llvm/test/CodeGen/X86/pr32329.ll
+++ b/llvm/test/CodeGen/X86/pr32329.ll
@@ -29,33 +29,40 @@ define void @foo() local_unnamed_addr {
 ; X86-NEXT:    .cfi_offset %edi, -16
 ; X86-NEXT:    .cfi_offset %ebx, -12
 ; X86-NEXT:    .cfi_offset %ebp, -8
-; X86-NEXT:    movsbl var_27, %eax
-; X86-NEXT:    movzwl var_2, %ebx
-; X86-NEXT:    movl var_310, %ecx
-; X86-NEXT:    imull %eax, %ecx
-; X86-NEXT:    addl var_24, %ecx
-; X86-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
-; X86-NEXT:    andl obj, %esi
+; X86-NEXT:    movl obj, %esi
 ; X86-NEXT:    leal (%esi,%esi), %edx
-; X86-NEXT:    subl %eax, %edx
-; X86-NEXT:    movl %edx, %edi
-; X86-NEXT:    subl %ebx, %edi
-; X86-NEXT:    imull %edi, %ecx
+; X86-NEXT:    movzwl var_2, %eax
+; X86-NEXT:    movb var_310, %ch
+; X86-NEXT:    movsbl var_27, %ebx
+; X86-NEXT:    subb %bl, %dl
+; X86-NEXT:    movb %dl, %cl
+; X86-NEXT:    subb %al, %cl
+; X86-NEXT:    movl %eax, %ebp
+; X86-NEXT:    movb %ch, %al
+; X86-NEXT:    mulb %bl
+; X86-NEXT:    addb var_24, %al
+; X86-NEXT:    mulb %cl
+; X86-NEXT:    movl %eax, %ecx
 ; X86-NEXT:    addb $113, %cl
-; X86-NEXT:    movl $9, %ebx
-; X86-NEXT:    xorl %ebp, %ebp
-; X86-NEXT:    shldl %cl, %ebx, %ebp
-; X86-NEXT:    shll %cl, %ebx
+; X86-NEXT:    movl $9, %eax
+; X86-NEXT:    xorl %edi, %edi
+; X86-NEXT:    shldl %cl, %eax, %edi
+; X86-NEXT:    shll %cl, %eax
 ; X86-NEXT:    testb $32, %cl
-; X86-NEXT:    cmovnel %ebx, %ebp
+; X86-NEXT:    cmovnel %eax, %edi
 ; X86-NEXT:    movl $0, %ecx
-; X86-NEXT:    cmovnel %ecx, %ebx
-; X86-NEXT:    cmpl %esi, %edi
-; X86-NEXT:    movl %ebp, var_50+4
-; X86-NEXT:    movl %ebx, var_50
+; X86-NEXT:    cmovnel %ecx, %eax
+; X86-NEXT:    andl $4194303, %esi # imm = 0x3FFFFF
+; X86-NEXT:    leal (%esi,%esi), %ecx
+; X86-NEXT:    subl %ebx, %ecx
+; X86-NEXT:    subl %ebp, %ecx
+; X86-NEXT:    cmpl %esi, %ecx
+; X86-NEXT:    movl %edi, var_50+4
+; X86-NEXT:    movl %eax, var_50
 ; X86-NEXT:    setge var_205
-; X86-NEXT:    imull %eax, %edx
-; X86-NEXT:    movb %dl, var_218
+; X86-NEXT:    movl %edx, %eax
+; X86-NEXT:    mulb %bl
+; X86-NEXT:    movb %al, var_218
 ; X86-NEXT:    popl %esi
 ; X86-NEXT:    .cfi_def_cfa_offset 16
 ; X86-NEXT:    popl %edi
@@ -68,27 +75,33 @@ define void @foo() local_unnamed_addr {
 ;
 ; X64-LABEL: foo:
 ; X64:       # %bb.0: # %entry
-; X64-NEXT:    movsbl var_27(%rip), %eax
-; X64-NEXT:    movzwl var_2(%rip), %edx
-; X64-NEXT:    movl var_310(%rip), %ecx
-; X64-NEXT:    imull %eax, %ecx
-; X64-NEXT:    addl var_24(%rip), %ecx
-; X64-NEXT:    movl $4194303, %esi # imm = 0x3FFFFF
-; X64-NEXT:    andl obj(%rip), %esi
-; X64-NEXT:    leal (%rsi,%rsi), %edi
-; X64-NEXT:    subl %eax, %edi
-; X64-NEXT:    movl %edi, %r8d
-; X64-NEXT:    subl %edx, %r8d
-; X64-NEXT:    imull %r8d, %ecx
-; X64-NEXT:    addb $113, %cl
-; X64-NEXT:    movl $9, %edx
+; X64-NEXT:    movl obj(%rip), %esi
+; X64-NEXT:    leal (%rsi,%rsi), %edx
+; X64-NEXT:    movzwl var_2(%rip), %ecx
+; X64-NEXT:    movzwl %cx, %r8d
+; X64-NEXT:    movzbl var_310(%rip), %eax
+; X64-NEXT:    movsbl var_27(%rip), %edi
+; X64-NEXT:    subb %dil, %dl
+; X64-NEXT:    movl %edx, %r9d
+; X64-NEXT:    subb %cl, %r9b
+; X64-NEXT:    mulb %dil
+; X64-NEXT:    addb var_24(%rip), %al
+; X64-NEXT:    mulb %r9b
+; X64-NEXT:    # kill: def $al killed $al def $rax
+; X64-NEXT:    leal 113(%rax), %ecx
+; X64-NEXT:    movl $9, %eax
 ; X64-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NEXT:    shlq %cl, %rdx
-; X64-NEXT:    movq %rdx, var_50(%rip)
-; X64-NEXT:    cmpl %esi, %r8d
+; X64-NEXT:    shlq %cl, %rax
+; X64-NEXT:    movq %rax, var_50(%rip)
+; X64-NEXT:    andl $4194303, %esi # imm = 0x3FFFFF
+; X64-NEXT:    leal (%rsi,%rsi), %eax
+; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subl %r8d, %eax
+; X64-NEXT:    cmpl %esi, %eax
 ; X64-NEXT:    setge var_205(%rip)
-; X64-NEXT:    imull %eax, %edi
-; X64-NEXT:    movb %dil, var_218(%rip)
+; X64-NEXT:    movl %edx, %eax
+; X64-NEXT:    mulb %dil
+; X64-NEXT:    movb %al, var_218(%rip)
 ; X64-NEXT:    retq
   entry:
   %bf.load = load i32, ptr @obj, align 8

>From e9c95eac258dac5f186e3faa159fc07aef6d3707 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:25:06 -0700
Subject: [PATCH 41/56] Prefer readable condition

---
 llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index f07d5c43209b52..0e6be2068aa713 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -34901,7 +34901,7 @@ bool X86TargetLowering::isFMAFasterThanFMulAndFAdd(const MachineFunction &MF,
 bool X86TargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                               EVT DestVT) const {
   // i16 instructions are longer (0x66 prefix) and potentially slower.
-  if (SrcVT.isVector() || DestVT.isVector())
+  if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
     return false;
   return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
 }
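
A note on the rewritten guard: it is not purely cosmetic. Besides vectors, the new condition also bails out for scalar non-integer types. A minimal standalone sketch of the difference, assuming only the EVT predicates from llvm/CodeGen/ValueTypes.h (an illustration, not the actual hook):

  #include "llvm/CodeGen/ValueTypes.h"
  using namespace llvm;

  // Old form: only vector types were rejected.
  static bool rejectOld(EVT SrcVT, EVT DestVT) {
    return SrcVT.isVector() || DestVT.isVector();
  }

  // New form: reject anything that is not a pair of scalar integers,
  // which covers vectors *and* scalar FP types such as f32.
  static bool rejectNew(EVT SrcVT, EVT DestVT) {
    return !(SrcVT.isScalarInteger() && DestVT.isScalarInteger());
  }

  // rejectOld(f32, f32) == false, but rejectNew(f32, f32) == true, so
  // the i32/i16 narrowing query below can never fire for FP types.

Since the i16-prefix rationale only makes sense for integer narrowing, the stricter form arguably spells out the intended semantics rather than changing behavior in practice.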

>From 8aac05adbdc4a0acd19ba45df91ebe952b7d2a39 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:42:14 -0700
Subject: [PATCH 42/56] Update 2 X86 tests that I missed

---
 llvm/test/CodeGen/X86/vec_saddo.ll | 8 ++------
 llvm/test/CodeGen/X86/vec_uaddo.ll | 8 ++------
 2 files changed, 4 insertions(+), 12 deletions(-)

diff --git a/llvm/test/CodeGen/X86/vec_saddo.ll b/llvm/test/CodeGen/X86/vec_saddo.ll
index 0aea89e4acf018..460c5fe11f82a5 100644
--- a/llvm/test/CodeGen/X86/vec_saddo.ll
+++ b/llvm/test/CodeGen/X86/vec_saddo.ll
@@ -1045,16 +1045,12 @@ define <4 x i32> @saddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: saddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
-; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq
diff --git a/llvm/test/CodeGen/X86/vec_uaddo.ll b/llvm/test/CodeGen/X86/vec_uaddo.ll
index 34e2811cb9bb5c..bac118095331ca 100644
--- a/llvm/test/CodeGen/X86/vec_uaddo.ll
+++ b/llvm/test/CodeGen/X86/vec_uaddo.ll
@@ -1098,16 +1098,12 @@ define <4 x i32> @uaddo_v4i1(<4 x i1> %a0, <4 x i1> %a1, ptr %p2) nounwind {
 ;
 ; AVX512-LABEL: uaddo_v4i1:
 ; AVX512:       # %bb.0:
-; AVX512-NEXT:    vpslld $31, %xmm0, %xmm2
+; AVX512-NEXT:    vpxor %xmm1, %xmm0, %xmm2
+; AVX512-NEXT:    vpslld $31, %xmm2, %xmm2
 ; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k0
-; AVX512-NEXT:    vpslld $31, %xmm1, %xmm2
-; AVX512-NEXT:    vptestmd %xmm2, %xmm2, %k1
-; AVX512-NEXT:    kxorw %k1, %k0, %k0
 ; AVX512-NEXT:    vpand %xmm1, %xmm0, %xmm0
 ; AVX512-NEXT:    vpslld $31, %xmm0, %xmm0
 ; AVX512-NEXT:    vpsrad $31, %xmm0, %xmm0
-; AVX512-NEXT:    kshiftlw $12, %k0, %k0
-; AVX512-NEXT:    kshiftrw $12, %k0, %k0
 ; AVX512-NEXT:    kmovd %k0, %eax
 ; AVX512-NEXT:    movb %al, (%rdi)
 ; AVX512-NEXT:    retq

>From af214c6b06b6717426c3ef258c11b7bfc9f5d5ce Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 15:43:05 -0700
Subject: [PATCH 43/56] Update register names for NVPTX/boolean-patterns.ll

---
 llvm/test/CodeGen/NVPTX/boolean-patterns.ll | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
index c30ebccdf9199c..fc3a48067a4c8a 100644
--- a/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
+++ b/llvm/test/CodeGen/NVPTX/boolean-patterns.ll
@@ -43,14 +43,14 @@ define i1 @m2and_ri(i1 %a) {
 define i1 @select2or(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2or(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u8 %rs1, [select2or_param_0];
 ; CHECK-NEXT:    ld.param.u8 %rs2, [select2or_param_1];
-; CHECK-NEXT:    or.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    or.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;
@@ -61,14 +61,14 @@ define i1 @select2or(i1 %a, i1 %b) {
 define i1 @select2and(i1 %a, i1 %b) {
 ; CHECK-LABEL: select2and(
 ; CHECK:       {
-; CHECK-NEXT:    .reg .b16 %rs<4>;
+; CHECK-NEXT:    .reg .b16 %rs<5>;
 ; CHECK-NEXT:    .reg .b32 %r<3>;
 ; CHECK-EMPTY:
 ; CHECK-NEXT:  // %bb.0:
 ; CHECK-NEXT:    ld.param.u8 %rs1, [select2and_param_0];
 ; CHECK-NEXT:    ld.param.u8 %rs2, [select2and_param_1];
-; CHECK-NEXT:    and.b16 %rs3, %rs1, %rs2;
-; CHECK-NEXT:    cvt.u32.u16 %r1, %rs3;
+; CHECK-NEXT:    and.b16 %rs4, %rs1, %rs2;
+; CHECK-NEXT:    cvt.u32.u16 %r1, %rs4;
 ; CHECK-NEXT:    and.b32 %r2, %r1, 1;
 ; CHECK-NEXT:    st.param.b32 [func_retval0+0], %r2;
 ; CHECK-NEXT:    ret;

>From e30278076cbeac05afa2d2aa54598dc89e7e5a2e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Sun, 15 Sep 2024 19:02:23 -0700
Subject: [PATCH 44/56] Move isTypeLegal condition up

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 990c0d57660752..9c532c44f182c4 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5903,6 +5903,9 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
 
   // logic_op (truncate x), (truncate y) --> truncate (logic_op x, y)
   if (HandOpcode == ISD::TRUNCATE) {
+    // Don't create a logic op on an illegal type.
+    if (!TLI.isTypeLegal(XVT))
+      return SDValue();
     // If both operands have other uses, this transform would create extra
     // instructions without eliminating anything.
     if (!N0.hasOneUse() && !N1.hasOneUse())
@@ -5921,9 +5924,6 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();
-    // Don't create a logic op on an illegal type.
-    if (!TLI.isTypeLegal(XVT))
-      return SDValue();
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
     return DAG.getNode(HandOpcode, DL, VT, Logic);
   }
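
The hazard this reordering guards against: the hoist rebuilds the logic op at the wide type XVT, and none of the checks that previously ran first guarantee that type is legal. A sketch with hypothetical types (the identifiers match the hunk above; the i128/i64 choice is made up for illustration):

  // Suppose XVT = i128 and VT = i64 on a typical 64-bit target:
  //   t1 = trunc i128 %x to i64
  //   t2 = trunc i128 %y to i64
  //   t3 = and i64 t1, t2
  // Hoisting would create "and i128 %x, %y", an illegal node that the
  // legalizer immediately has to expand again. Checking legality first
  // makes the bail-out unconditional:
  if (HandOpcode == ISD::TRUNCATE) {
    if (!TLI.isTypeLegal(XVT))
      return SDValue(); // never create a logic op on an illegal type
    // ... one-use and profitability heuristics as in the hunk above ...
  }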

>From 1e7ebe3bd55ce93f3c8003486a65d10e33d585f3 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Wed, 18 Sep 2024 19:53:11 -0700
Subject: [PATCH 45/56] Update AMDGPU/computeNumSignBits-mul.ll

---
 llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
index 11795cca18daa2..07a4478241b186 100644
--- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
@@ -5,9 +5,9 @@ define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v1, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v1
+; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %A = sext i8 %X to i48
   %B = sext i8 %Y to i48
@@ -24,8 +24,8 @@ define i16 @num_sign_bits_mul_i48_1(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_1:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
 ; GFX9-NEXT:    v_mul_hi_i32_i24_e32 v1, v0, v2
 ; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]

>From 24395306c21d4ec9bb73a40cac6328dc5ab88f9a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 19 Sep 2024 19:19:14 -0700
Subject: [PATCH 46/56] Update X86 bit_ceil.ll & scmp.ll

---
 llvm/test/CodeGen/X86/bit_ceil.ll | 16 ++++++++--------
 llvm/test/CodeGen/X86/scmp.ll     |  2 +-
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/llvm/test/CodeGen/X86/bit_ceil.ll b/llvm/test/CodeGen/X86/bit_ceil.ll
index 823453087f6180..13ad9b98ebcc4f 100644
--- a/llvm/test/CodeGen/X86/bit_ceil.ll
+++ b/llvm/test/CodeGen/X86/bit_ceil.ll
@@ -13,8 +13,8 @@ define i32 @bit_ceil_i32(i32 %x) {
 ; NOBMI-NEXT:    bsrl %eax, %eax
 ; NOBMI-NEXT:    movl $63, %ecx
 ; NOBMI-NEXT:    cmovnel %eax, %ecx
-; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-32, %cl
+; NOBMI-NEXT:    addb $33, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -50,8 +50,8 @@ define i32 @bit_ceil_i32_plus1(i32 noundef %x) {
 ; NOBMI-NEXT:    bsrl %edi, %eax
 ; NOBMI-NEXT:    movl $63, %ecx
 ; NOBMI-NEXT:    cmovnel %eax, %ecx
-; NOBMI-NEXT:    xorl $31, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-32, %cl
+; NOBMI-NEXT:    addb $33, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -89,8 +89,8 @@ define i64 @bit_ceil_i64(i64 %x) {
 ; NOBMI-NEXT:    bsrq %rax, %rax
 ; NOBMI-NEXT:    movl $127, %ecx
 ; NOBMI-NEXT:    cmovneq %rax, %rcx
-; NOBMI-NEXT:    xorl $63, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-64, %cl
+; NOBMI-NEXT:    addb $65, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
@@ -125,8 +125,8 @@ define i64 @bit_ceil_i64_plus1(i64 noundef %x) {
 ; NOBMI-NEXT:    bsrq %rdi, %rax
 ; NOBMI-NEXT:    movl $127, %ecx
 ; NOBMI-NEXT:    cmovneq %rax, %rcx
-; NOBMI-NEXT:    xorl $63, %ecx
-; NOBMI-NEXT:    negb %cl
+; NOBMI-NEXT:    xorb $-64, %cl
+; NOBMI-NEXT:    addb $65, %cl
 ; NOBMI-NEXT:    movl $1, %edx
 ; NOBMI-NEXT:    movl $1, %eax
 ; NOBMI-NEXT:    # kill: def $cl killed $cl killed $rcx
diff --git a/llvm/test/CodeGen/X86/scmp.ll b/llvm/test/CodeGen/X86/scmp.ll
index 0746a07d2cdf26..4ec3067397a669 100644
--- a/llvm/test/CodeGen/X86/scmp.ll
+++ b/llvm/test/CodeGen/X86/scmp.ll
@@ -2435,7 +2435,7 @@ define <7 x i117> @scmp_uncommon_vectors(<7 x i7> %x, <7 x i7> %y) nounwind {
 ; X86-NEXT:    movl %edi, 28(%ebx)
 ; X86-NEXT:    movl %edx, %eax
 ; X86-NEXT:    shrl $18, %eax
-; X86-NEXT:    andl $7, %eax
+; X86-NEXT:    andb $7, %al
 ; X86-NEXT:    movb %al, 102(%ebx)
 ; X86-NEXT:    movl %ebx, %eax
 ; X86-NEXT:    addl $52, %esp

>From f59a28754f26ac03ec64f6b24aba016db940ff4f Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Thu, 19 Sep 2024 19:25:14 -0700
Subject: [PATCH 47/56] Address comment

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9c532c44f182c4..34e0b08f79d6d9 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5920,7 +5920,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
     // widening a binop.
     if (TLI.isZExtFree(VT, XVT) && TLI.isTruncateFree(XVT, VT))
       return SDValue();
-    // Prevent an infinite loop if the target preferts the inverse
+    // Prevent an infinite loop if the target prefers the inverse
     // transformation.
     if (TLI.isNarrowingProfitable(XVT, VT))
       return SDValue();

>From 9ff8a60adcf6f1403095f83fade248ee74e043ee Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 19:38:06 -0700
Subject: [PATCH 48/56] Prevent infinite loop by checking
 TLI.IsDesirableToPromoteOp()

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp |  5 +-
 llvm/test/CodeGen/X86/mul-constant-i8.ll      | 61 +++++++++++--------
 .../CodeGen/X86/srem-seteq-illegal-types.ll   |  2 +-
 llvm/test/CodeGen/X86/urem-i8-constant.ll     |  5 +-
 .../CodeGen/X86/urem-seteq-illegal-types.ll   | 14 +++--
 5 files changed, 51 insertions(+), 36 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 34e0b08f79d6d9..9a8da33ecc3a3b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15546,7 +15546,10 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
         break;
       SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
-      return DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+      SDValue TruncatedOp = DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+      if (TLI.(TruncatedOp, SrcVT))
+        break;
+      return TruncatedOp;
     }
   }
 
diff --git a/llvm/test/CodeGen/X86/mul-constant-i8.ll b/llvm/test/CodeGen/X86/mul-constant-i8.ll
index a4fa1ee8c00291..c972805e071c6e 100644
--- a/llvm/test/CodeGen/X86/mul-constant-i8.ll
+++ b/llvm/test/CodeGen/X86/mul-constant-i8.ll
@@ -72,7 +72,7 @@ define i8 @test_mul_by_7(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (,%rdi,8), %eax
-; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 7
@@ -118,7 +118,8 @@ define i8 @test_mul_by_11(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rdi,%rax,2), %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 11
@@ -142,7 +143,8 @@ define i8 @test_mul_by_13(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    shll $2, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 13
@@ -155,7 +157,7 @@ define i8 @test_mul_by_14(i8 %x) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    leal (%rax,%rax), %ecx
 ; X64-NEXT:    shll $4, %eax
-; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $rax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 14
@@ -188,6 +190,7 @@ define i8 @test_mul_by_16(i8 %x) {
 define i8 @test_mul_by_17(i8 %x) {
 ; X64-LABEL: test_mul_by_17:
 ; X64:       # %bb.0:
+; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shll $4, %eax
 ; X64-NEXT:    addl %edi, %eax
@@ -214,7 +217,8 @@ define i8 @test_mul_by_19(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    leal (%rdi,%rax,2), %eax
+; X64-NEXT:    addl %eax, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 19
@@ -238,7 +242,8 @@ define i8 @test_mul_by_21(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    shll $2, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 21
@@ -249,9 +254,10 @@ define i8 @test_mul_by_22(i8 %x) {
 ; X64-LABEL: test_mul_by_22:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rdi,%rdi,4), %ecx
+; X64-NEXT:    shll $2, %ecx
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    addb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 22
@@ -264,7 +270,7 @@ define i8 @test_mul_by_23(i8 %x) {
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,2), %eax
 ; X64-NEXT:    shll $3, %eax
-; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 23
@@ -301,7 +307,7 @@ define i8 @test_mul_by_26(i8 %x) {
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,4), %eax
 ; X64-NEXT:    leal (%rax,%rax,4), %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 26
@@ -326,7 +332,7 @@ define i8 @test_mul_by_28(i8 %x) {
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
 ; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 28
@@ -338,9 +344,9 @@ define i8 @test_mul_by_29(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    leal (%rax,%rax,2), %eax
-; X64-NEXT:    addl %edi, %eax
-; X64-NEXT:    addl %edi, %eax
+; X64-NEXT:    leal (%rax,%rax,2), %ecx
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    addb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 29
@@ -353,7 +359,7 @@ define i8 @test_mul_by_30(i8 %x) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    leal (%rax,%rax), %ecx
 ; X64-NEXT:    shll $5, %eax
-; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $rax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 30
@@ -365,7 +371,7 @@ define i8 @test_mul_by_31(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    shll $5, %eax
-; X64-NEXT:    subl %edi, %eax
+; X64-NEXT:    subb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 31
@@ -388,7 +394,8 @@ define i8 @test_mul_by_37(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
+; X64-NEXT:    shll $2, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 37
@@ -400,7 +407,8 @@ define i8 @test_mul_by_41(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    leal (%rdi,%rax,8), %eax
+; X64-NEXT:    shll $3, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 41
@@ -413,7 +421,7 @@ define i8 @test_mul_by_62(i8 %x) {
 ; X64-NEXT:    movl %edi, %eax
 ; X64-NEXT:    leal (%rax,%rax), %ecx
 ; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    subl %ecx, %eax
+; X64-NEXT:    subb %cl, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $rax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 62
@@ -424,9 +432,9 @@ define i8 @test_mul_by_66(i8 %x) {
 ; X64-LABEL: test_mul_by_66:
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NEXT:    movl %edi, %eax
-; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    leal (%rax,%rdi,2), %eax
+; X64-NEXT:    leal (%rdi,%rdi), %eax
+; X64-NEXT:    shll $6, %edi
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 66
@@ -438,7 +446,8 @@ define i8 @test_mul_by_73(i8 %x) {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    leal (%rdi,%rax,8), %eax
+; X64-NEXT:    shll $3, %eax
+; X64-NEXT:    addb %dil, %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, 73
@@ -462,7 +471,7 @@ define i8 @test_mul_by_neg10(i8 %x) {
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    addl %edi, %edi
 ; X64-NEXT:    leal (%rdi,%rdi,4), %eax
-; X64-NEXT:    negl %eax
+; X64-NEXT:    negb %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, -10
@@ -475,7 +484,7 @@ define i8 @test_mul_by_neg36(i8 %x) {
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    shll $2, %edi
 ; X64-NEXT:    leal (%rdi,%rdi,8), %eax
-; X64-NEXT:    negl %eax
+; X64-NEXT:    negb %al
 ; X64-NEXT:    # kill: def $al killed $al killed $eax
 ; X64-NEXT:    retq
   %m = mul i8 %x, -36
diff --git a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
index cc4bda81bef527..392beb72c7bfda 100644
--- a/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/srem-seteq-illegal-types.ll
@@ -380,7 +380,7 @@ define <3 x i1> @test_srem_vec(<3 x i33> %X) nounwind {
 ; AVX2-NEXT:    vpxor %ymm1, %ymm0, %ymm1
 ; AVX2-NEXT:    vextracti128 $1, %ymm1, %xmm2
 ; AVX2-NEXT:    vmovd %xmm0, %eax
-; AVX2-NEXT:    notl %eax
+; AVX2-NEXT:    notb %al
 ; AVX2-NEXT:    vpextrb $8, %xmm1, %edx
 ; AVX2-NEXT:    vpextrb $0, %xmm2, %ecx
 ; AVX2-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/urem-i8-constant.ll b/llvm/test/CodeGen/X86/urem-i8-constant.ll
index ae218405c0ef03..a2a0367cc1b0f3 100644
--- a/llvm/test/CodeGen/X86/urem-i8-constant.ll
+++ b/llvm/test/CodeGen/X86/urem-i8-constant.ll
@@ -10,8 +10,9 @@ define i8 @foo(i8 %tmp325) {
 ; CHECK-NEXT:    imull $111, %eax, %ecx
 ; CHECK-NEXT:    shrl $12, %ecx
 ; CHECK-NEXT:    leal (%ecx,%ecx,8), %edx
-; CHECK-NEXT:    leal (%ecx,%edx,4), %ecx
-; CHECK-NEXT:    subb %cl, %al
+; CHECK-NEXT:    shll $2, %edx
+; CHECK-NEXT:    addb %cl, %dl
+; CHECK-NEXT:    subb %dl, %al
 ; CHECK-NEXT:    # kill: def $al killed $al killed $eax
 ; CHECK-NEXT:    retl
   %t546 = urem i8 %tmp325, 37
diff --git a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
index b4e91da920a2fd..35e1edaae2d50f 100644
--- a/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
+++ b/llvm/test/CodeGen/X86/urem-seteq-illegal-types.ll
@@ -64,9 +64,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; X86:       # %bb.0:
 ; X86-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NEXT:    leal (%eax,%eax,2), %ecx
-; X86-NEXT:    leal (%eax,%ecx,4), %eax
-; X86-NEXT:    andb $15, %al
-; X86-NEXT:    cmpb $4, %al
+; X86-NEXT:    shll $2, %ecx
+; X86-NEXT:    addb %al, %cl
+; X86-NEXT:    andb $15, %cl
+; X86-NEXT:    cmpb $4, %cl
 ; X86-NEXT:    setae %al
 ; X86-NEXT:    retl
 ;
@@ -74,9 +75,10 @@ define i1 @test_urem_odd_setne(i4 %X) nounwind {
 ; X64:       # %bb.0:
 ; X64-NEXT:    # kill: def $edi killed $edi def $rdi
 ; X64-NEXT:    leal (%rdi,%rdi,2), %eax
-; X64-NEXT:    leal (%rdi,%rax,4), %eax
-; X64-NEXT:    andb $15, %al
-; X64-NEXT:    cmpb $4, %al
+; X64-NEXT:    shll $2, %eax
+; X64-NEXT:    addb %al, %dil
+; X64-NEXT:    andb $15, %dil
+; X64-NEXT:    cmpb $4, %dil
 ; X64-NEXT:    setae %al
 ; X64-NEXT:    retq
   %urem = urem i4 %X, 5
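
The loop this commit breaks, sketched under one assumption (that the target's promotion hook would widen the freshly narrowed op straight back to SrcVT, which is the X86 behavior the regressing i8/i16 tests above exercise); the call is spelled here in the form the fixup in the next commit settles on:

  // Hypothetical ping-pong, for illustration only:
  //   trunc (add i32 %x, %y) to i8        ; visitTRUNCATE fires
  //   --> add i8 (trunc %x), (trunc %y)   ; narrowed by this combine
  //   --> target promotes the i8 add back to i32 via
  //       IsDesirableToPromoteOp
  //   --> the truncate-of-add reappears and the combine fires again ...
  // The fix builds the narrow op and asks the target before returning:
  SDValue TruncatedOp = DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
  if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
    break; // the target would promote it right back; keep the original
  return TruncatedOp;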

>From 89d14d35e165d71c9dd5d1d8f24357c0bf705d7e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 19:39:05 -0700
Subject: [PATCH 49/56] Fixup previous commit

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 9a8da33ecc3a3b..aef0400bb95fa0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15547,7 +15547,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
       SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
       SDValue TruncatedOp = DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
-      if (TLI.(TruncatedOp, SrcVT))
+      if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
         break;
       return TruncatedOp;
     }

>From e46d01e07a60acbe0f43471cc1a315117f6f32bf Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 19:39:29 -0700
Subject: [PATCH 50/56] clang-format

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index aef0400bb95fa0..2bf8051775c5c0 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15546,7 +15546,8 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
         break;
       SDValue NarrowL = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(0));
       SDValue NarrowR = DAG.getNode(ISD::TRUNCATE, DL, VT, N0.getOperand(1));
-      SDValue TruncatedOp = DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
+      SDValue TruncatedOp =
+          DAG.getNode(N0.getOpcode(), DL, VT, NarrowL, NarrowR);
       if (TLI.IsDesirableToPromoteOp(TruncatedOp, SrcVT))
         break;
       return TruncatedOp;

>From 7082f5fe31cb360d3e6d241a8390ca09bd14274a Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 20:13:37 -0700
Subject: [PATCH 51/56] Use new isNarrowingProfitable API

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 2bf8051775c5c0..805ab4b035097b 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -5922,7 +5922,7 @@ SDValue DAGCombiner::hoistLogicOpWithSameOpcodeHands(SDNode *N) {
       return SDValue();
     // Prevent an infinite loop if the target prefers the inverse
     // transformation.
-    if (TLI.isNarrowingProfitable(XVT, VT))
+    if (TLI.isNarrowingProfitable(N, XVT, VT))
       return SDValue();
     SDValue Logic = DAG.getNode(LogicOpcode, DL, XVT, X, Y);
     return DAG.getNode(HandOpcode, DL, VT, Logic);
@@ -15532,7 +15532,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  if (TLI.isNarrowingProfitable(SrcVT, VT)) {
+  if (TLI.isNarrowingProfitable(N, SrcVT, VT)) {
     switch (N0.getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:
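
With the node passed in, a target override can in principle make the narrowing decision per node rather than per type pair alone. A minimal sketch of an override under the new signature (hypothetical target class; the signature and the type-based body mirror the X86 hook from PATCH 41, everything else is illustrative):

  bool MyTargetLowering::isNarrowingProfitable(SDNode *N, EVT SrcVT,
                                               EVT DestVT) const {
    // N is now available, so the answer could also depend on the opcode
    // or operands; this sketch only reproduces the type-based logic.
    if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
      return false;
    return !(SrcVT == MVT::i32 && DestVT == MVT::i16);
  }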

>From 888245d4e645c7692303b3606b9b688b332f0cdc Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 20:13:47 -0700
Subject: [PATCH 52/56] Update tests on ToT

---
 llvm/test/CodeGen/X86/ctlo.ll                 |    6 +-
 llvm/test/CodeGen/X86/ctpop-mask.ll           |    9 +-
 llvm/test/CodeGen/X86/shift-i128.ll           |  387 +-
 llvm/test/CodeGen/X86/ucmp.ll                 | 1883 +++++--
 ...lar-shift-by-byte-multiple-legalization.ll | 4969 +++++++++--------
 .../X86/wide-scalar-shift-legalization.ll     | 3010 +++++-----
 ...ad-of-small-alloca-with-zero-upper-half.ll |  763 +--
 .../CodeGen/X86/widen-load-of-small-alloca.ll |   98 +-
 8 files changed, 6077 insertions(+), 5048 deletions(-)

diff --git a/llvm/test/CodeGen/X86/ctlo.ll b/llvm/test/CodeGen/X86/ctlo.ll
index 9de0618a3aa022..ac8ab6cebf0a3d 100644
--- a/llvm/test/CodeGen/X86/ctlo.ll
+++ b/llvm/test/CodeGen/X86/ctlo.ll
@@ -54,9 +54,9 @@ define i8 @ctlo_i8(i8 %x) {
 ;
 ; X86-CLZ-LABEL: ctlo_i8:
 ; X86-CLZ:       # %bb.0:
-; X86-CLZ-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-CLZ-NEXT:    shll $24, %eax
-; X86-CLZ-NEXT:    notl %eax
+; X86-CLZ-NEXT:    movzbl {{[0-9]+}}(%esp), %eax
+; X86-CLZ-NEXT:    notb %al
+; X86-CLZ-NEXT:    movzbl %al, %eax
 ; X86-CLZ-NEXT:    lzcntl %eax, %eax
 ; X86-CLZ-NEXT:    addb $-24, %al
 ; X86-CLZ-NEXT:    # kill: def $al killed $al killed $eax
diff --git a/llvm/test/CodeGen/X86/ctpop-mask.ll b/llvm/test/CodeGen/X86/ctpop-mask.ll
index cbb0ae719e5aa3..4fbc59e29d304d 100644
--- a/llvm/test/CodeGen/X86/ctpop-mask.ll
+++ b/llvm/test/CodeGen/X86/ctpop-mask.ll
@@ -200,7 +200,7 @@ define i64 @ctpop_mask4(i64 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
+; X64-NO-POPCOUNT-NEXT:    andl $15, %edi
 ; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
@@ -240,10 +240,9 @@ define i32 @ctpop_shifted_mask4(i32 %x) nounwind readnone {
 ;
 ; X64-NO-POPCOUNT-LABEL: ctpop_shifted_mask4:
 ; X64-NO-POPCOUNT:       # %bb.0:
-; X64-NO-POPCOUNT-NEXT:    # kill: def $edi killed $edi def $rdi
-; X64-NO-POPCOUNT-NEXT:    shrl $9, %edi
-; X64-NO-POPCOUNT-NEXT:    andb $15, %dil
-; X64-NO-POPCOUNT-NEXT:    leal (,%rdi,4), %ecx
+; X64-NO-POPCOUNT-NEXT:    movl %edi, %ecx
+; X64-NO-POPCOUNT-NEXT:    shrl $7, %ecx
+; X64-NO-POPCOUNT-NEXT:    andb $60, %cl
 ; X64-NO-POPCOUNT-NEXT:    movabsq $4841987667533046032, %rax # imm = 0x4332322132212110
 ; X64-NO-POPCOUNT-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-POPCOUNT-NEXT:    shrq %cl, %rax
diff --git a/llvm/test/CodeGen/X86/shift-i128.ll b/llvm/test/CodeGen/X86/shift-i128.ll
index e5a158caf59b19..c6afb0fc386e51 100644
--- a/llvm/test/CodeGen/X86/shift-i128.ll
+++ b/llvm/test/CodeGen/X86/shift-i128.ll
@@ -253,105 +253,87 @@ define void @test_lshr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 16(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 12(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 8(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl 36(%ebp), %edi
-; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %ecx
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movl 32(%ebp), %edx
+; i686-NEXT:    movl 20(%ebp), %esi
 ; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 36(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    andl $7, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl $3, %ebx
-; i686-NEXT:    andl $15, %ebx
-; i686-NEXT:    movl 32(%esp,%ebx), %eax
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrl $3, %eax
+; i686-NEXT:    andl $12, %eax
+; i686-NEXT:    movl 40(%esp,%eax), %edx
+; i686-NEXT:    movl 36(%esp,%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 32(%esp,%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 44(%esp,%eax), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %cl, %eax
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 36(%esp,%ebx), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    shrdl %cl, %eax, %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 28(%esp,%ebx), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %esi, %edx
-; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    movl %ecx, %esi
 ; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $15, %esi
-; i686-NEXT:    movl 64(%esp,%esi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 68(%esp,%esi), %ebp
-; i686-NEXT:    leal (%ebp,%ebp), %edi
-; i686-NEXT:    shll %cl, %edi
-; i686-NEXT:    orl %eax, %edi
-; i686-NEXT:    movl 40(%esp,%ebx), %eax
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    movl %ebx, %ecx
+; i686-NEXT:    andl $12, %esi
+; i686-NEXT:    movl 72(%esp,%esi), %edi
+; i686-NEXT:    movl 68(%esp,%esi), %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edi, %ebx
+; i686-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 64(%esp,%esi), %edx
+; i686-NEXT:    movl 76(%esp,%esi), %esi
+; i686-NEXT:    movl %ecx, %ebx
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl 40(%ebp), %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl 60(%esp,%esi), %ecx
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl 72(%esp,%esi), %esi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %ebp
-; i686-NEXT:    movl %ebx, %ecx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    movl %ebx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %ebx, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    shrdl %cl, %ebx, %edx
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; i686-NEXT:    movl 72(%ebp), %ecx
 ; i686-NEXT:    movl %esi, 28(%ecx)
-; i686-NEXT:    movl %ebp, 24(%ecx)
-; i686-NEXT:    movl (%esp), %edx # 4-byte Reload
+; i686-NEXT:    movl %edi, 24(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    movl %esi, 20(%ecx)
 ; i686-NEXT:    movl %edx, 16(%ecx)
 ; i686-NEXT:    movl %eax, 12(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl %eax, 8(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, (%ecx)
-; i686-NEXT:    movl %edi, 20(%ecx)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    movl %eax, 4(%ecx)
-; i686-NEXT:    addl $92, %esp
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, (%ecx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -396,97 +378,89 @@ define void @test_ashr_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) no
 ; i686-NEXT:    pushl %ebx
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
-; i686-NEXT:    subl $92, %esp
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    sarl $31, %ebp
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; i686-NEXT:    andl $-16, %esp
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %ecx
+; i686-NEXT:    movl 24(%ebp), %eax
+; i686-NEXT:    movl 28(%ebp), %edi
+; i686-NEXT:    movl 32(%ebp), %edx
+; i686-NEXT:    movl 16(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 12(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 8(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 20(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    sarl $31, %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl 36(%ebp), %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:    andl $7, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl $3, %edi
-; i686-NEXT:    andl $15, %edi
-; i686-NEXT:    movl 32(%esp,%edi), %eax
+; i686-NEXT:    sarl $31, %esi
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; i686-NEXT:    movl %ecx, %eax
+; i686-NEXT:    shrl $3, %eax
+; i686-NEXT:    andl $12, %eax
+; i686-NEXT:    movl 40(%esp,%eax), %edx
+; i686-NEXT:    movl 36(%esp,%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edx, %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 32(%esp,%eax), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 44(%esp,%eax), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %cl, %eax
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 36(%esp,%edi), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    addl %edx, %edx
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    orl %eax, %edx
+; i686-NEXT:    shrdl %cl, %eax, %edx
 ; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %esi, %edx
-; i686-NEXT:    andl $7, %edx
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    movl %ecx, %esi
 ; i686-NEXT:    shrl $3, %esi
-; i686-NEXT:    andl $15, %esi
-; i686-NEXT:    movl 64(%esp,%esi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    notb %cl
-; i686-NEXT:    movl 68(%esp,%esi), %ebp
-; i686-NEXT:    leal (%ebp,%ebp), %ebx
-; i686-NEXT:    shll %cl, %ebx
-; i686-NEXT:    orl %eax, %ebx
-; i686-NEXT:    movl 28(%esp,%edi), %eax
-; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 40(%esp,%edi), %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shrdl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl 60(%esp,%esi), %ecx
-; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; i686-NEXT:    movl 72(%esp,%esi), %esi
-; i686-NEXT:    movl %edx, %ecx
-; i686-NEXT:    shrdl %cl, %esi, %ebp
-; i686-NEXT:    movl %eax, %ecx
+; i686-NEXT:    andl $12, %esi
+; i686-NEXT:    movl 72(%esp,%esi), %edi
+; i686-NEXT:    movl 68(%esp,%esi), %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    shrdl %cl, %edi, %edx
+; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 64(%esp,%esi), %ebx
+; i686-NEXT:    movl 76(%esp,%esi), %esi
+; i686-NEXT:    movl %ecx, %edx
+; i686-NEXT:    shrdl %cl, %esi, %edi
+; i686-NEXT:    movl 40(%ebp), %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; i686-NEXT:    shrdl %cl, %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    sarl %cl, %edi
-; i686-NEXT:    movl %edx, %ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shrdl %cl, %eax, (%esp) # 4-byte Folded Spill
+; i686-NEXT:    sarl %cl, %eax
+; i686-NEXT:    movl %edx, %ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT:    shrdl %cl, %edx, %ebx
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    sarl %cl, %esi
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; i686-NEXT:    movl %esi, 28(%eax)
-; i686-NEXT:    movl %ebp, 24(%eax)
-; i686-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 16(%eax)
-; i686-NEXT:    movl %edi, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
-; i686-NEXT:    movl %ebx, 20(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    addl $92, %esp
+; i686-NEXT:    movl 72(%ebp), %ecx
+; i686-NEXT:    movl %esi, 28(%ecx)
+; i686-NEXT:    movl %edi, 24(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, 20(%ecx)
+; i686-NEXT:    movl %ebx, 16(%ecx)
+; i686-NEXT:    movl %eax, 12(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, (%ecx)
+; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
 ; i686-NEXT:    popl %ebx
@@ -535,8 +509,8 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    pushl %edi
 ; i686-NEXT:    pushl %esi
 ; i686-NEXT:    andl $-16, %esp
-; i686-NEXT:    subl $128, %esp
-; i686-NEXT:    movl 40(%ebp), %edi
+; i686-NEXT:    subl $112, %esp
+; i686-NEXT:    movl 40(%ebp), %ebx
 ; i686-NEXT:    movl 24(%ebp), %eax
 ; i686-NEXT:    movl 28(%ebp), %ecx
 ; i686-NEXT:    movl 32(%ebp), %edx
@@ -553,7 +527,6 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl %edi, %ebx
 ; i686-NEXT:    shrl $3, %ebx
 ; i686-NEXT:    andl $12, %ebx
 ; i686-NEXT:    leal {{[0-9]+}}(%esp), %eax
@@ -562,28 +535,17 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%eax), %esi
+; i686-NEXT:    movl (%eax), %ecx
+; i686-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; i686-NEXT:    movl 4(%eax), %edx
-; i686-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; i686-NEXT:    movl 8(%eax), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl %edi, %ecx
-; i686-NEXT:    andl $31, %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shll %cl, %edx
-; i686-NEXT:    movl 4(%eax), %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %esi
+; i686-NEXT:    movl 40(%ebp), %ecx
 ; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
-; i686-NEXT:    notb %cl
-; i686-NEXT:    shrl %cl, %esi
-; i686-NEXT:    orl %edx, %esi
-; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; i686-NEXT:    movl (%eax), %eax
+; i686-NEXT:    shldl %cl, %edx, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 56(%ebp), %eax
-; i686-NEXT:    movl %eax, %edx
+; i686-NEXT:    movl %edx, %edi
+; i686-NEXT:    movl 56(%ebp), %edx
 ; i686-NEXT:    shrl $3, %edx
 ; i686-NEXT:    andl $12, %edx
 ; i686-NEXT:    leal {{[0-9]+}}(%esp), %ecx
@@ -592,54 +554,53 @@ define void @test_shl_v2i128(<2 x i128> %x, <2 x i128> %a, ptr nocapture %r) nou
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
 ; i686-NEXT:    movl $0, {{[0-9]+}}(%esp)
-; i686-NEXT:    movl (%ecx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 4(%ecx), %edi
-; i686-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl 8(%ecx), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    andl $31, %eax
+; i686-NEXT:    movl (%ecx), %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shrl %eax
-; i686-NEXT:    notb %cl
-; i686-NEXT:    shrl %cl, %eax
-; i686-NEXT:    orl %edi, %eax
-; i686-NEXT:    movl (%esi), %ecx
-; i686-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    movl %esi, %edi
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; i686-NEXT:    movl 4(%ecx), %esi
+; i686-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 8(%ecx), %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    shldl %cl, %esi, %eax
+; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; i686-NEXT:    movl (%esp), %esi # 4-byte Reload
+; i686-NEXT:    movl %esi, %eax
+; i686-NEXT:    movl 40(%ebp), %ecx
 ; i686-NEXT:    shll %cl, %eax
 ; i686-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; i686-NEXT:    shldl %cl, %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    shldl %cl, %esi, %edi
+; i686-NEXT:    movl %edi, (%esp) # 4-byte Spill
 ; i686-NEXT:    negl %ebx
-; i686-NEXT:    movl 76(%esp,%ebx), %ebx
+; i686-NEXT:    movl 60(%esp,%ebx), %ebx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    shldl %cl, %eax, %ebx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; i686-NEXT:    shldl %cl, %esi, %ebx
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; i686-NEXT:    movl %edi, %esi
+; i686-NEXT:    movl %esi, %edi
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    shll %cl, %edi
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    movl %eax, %ecx
-; i686-NEXT:    shll %cl, %esi
-; i686-NEXT:    shldl %cl, %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Folded Spill
+; i686-NEXT:    shldl %cl, %esi, %eax
 ; i686-NEXT:    negl %edx
-; i686-NEXT:    movl 108(%esp,%edx), %edx
+; i686-NEXT:    movl 92(%esp,%edx), %edx
+; i686-NEXT:    movl 56(%ebp), %ecx
+; i686-NEXT:    # kill: def $cl killed $cl killed $ecx
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; i686-NEXT:    shldl %cl, %esi, %edx
+; i686-NEXT:    movl 72(%ebp), %ecx
+; i686-NEXT:    movl %edx, 28(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; i686-NEXT:    movl %edx, 24(%ecx)
+; i686-NEXT:    movl %eax, 20(%ecx)
+; i686-NEXT:    movl %edi, 16(%ecx)
+; i686-NEXT:    movl %ebx, 12(%ecx)
 ; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; i686-NEXT:    shldl %cl, %eax, %edx
-; i686-NEXT:    movl 72(%ebp), %eax
-; i686-NEXT:    movl %edx, 28(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 24(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 20(%eax)
-; i686-NEXT:    movl %esi, 16(%eax)
-; i686-NEXT:    movl %ebx, 12(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 8(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, 4(%eax)
-; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; i686-NEXT:    movl %ecx, (%eax)
+; i686-NEXT:    movl %eax, 8(%ecx)
+; i686-NEXT:    movl (%esp), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, 4(%ecx)
+; i686-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; i686-NEXT:    movl %eax, (%ecx)
 ; i686-NEXT:    leal -12(%ebp), %esp
 ; i686-NEXT:    popl %esi
 ; i686-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/ucmp.ll b/llvm/test/CodeGen/X86/ucmp.ll
index 8db4213e66a5d8..b6f0295cfbe892 100644
--- a/llvm/test/CodeGen/X86/ucmp.ll
+++ b/llvm/test/CodeGen/X86/ucmp.ll
@@ -1470,410 +1470,1485 @@ define <16 x i8> @ucmp_wide_vec_op(<16 x i32> %x, <16 x i32> %y) nounwind {
 }
 
 define <17 x i2> @ucmp_uncommon_vectors(<17 x i71> %x, <17 x i71> %y) nounwind {
-; X64-LABEL: ucmp_uncommon_vectors:
-; X64:       # %bb.0:
-; X64-NEXT:    pushq %rbp
-; X64-NEXT:    pushq %r15
-; X64-NEXT:    pushq %r14
-; X64-NEXT:    pushq %r13
-; X64-NEXT:    pushq %r12
-; X64-NEXT:    pushq %rbx
-; X64-NEXT:    subq $120, %rsp
-; X64-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    andl $127, %edx
-; X64-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    andl $127, %r8d
-; X64-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, (%rsp) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    andl $127, %r12d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; X64-NEXT:    andl $127, %r14d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; X64-NEXT:    andl $127, %ebx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r15
-; X64-NEXT:    andl $127, %r15d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
-; X64-NEXT:    andl $127, %ebp
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r11
-; X64-NEXT:    andl $127, %r11d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r13
-; X64-NEXT:    andl $127, %r13d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT:    andl $127, %r10d
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    andl $127, %esi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    andl $127, %edi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    andl $127, %eax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    andl $127, %edx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    cmpq %r9, %r8
-; X64-NEXT:    movq %rdx, %rcx
-; X64-NEXT:    sbbq %rax, %rcx
-; X64-NEXT:    setb %cl
-; X64-NEXT:    cmpq %r8, %r9
-; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    sbbb $0, %cl
-; X64-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %rdi, %rdx
-; X64-NEXT:    sbbq %rsi, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %rdi, %rsi
-; X64-NEXT:    sbbb $0, %dl
-; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %r10, %rdx
-; X64-NEXT:    sbbq %r13, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %r10, %r13
-; X64-NEXT:    sbbb $0, %dl
-; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %r11, %rdx
-; X64-NEXT:    sbbq %rbp, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %r11, %rbp
-; X64-NEXT:    sbbb $0, %dl
-; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq %r15, %rdx
-; X64-NEXT:    sbbq %rbx, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %r15, %rbx
-; X64-NEXT:    sbbb $0, %dl
-; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    cmpq %rax, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, %rdx
-; X64-NEXT:    sbbq %r14, %rdx
-; X64-NEXT:    setb %dl
-; X64-NEXT:    cmpq %rcx, %rax
-; X64-NEXT:    sbbq %rsi, %r14
-; X64-NEXT:    sbbb $0, %dl
-; X64-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    cmpq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
-; X64-NEXT:    movq %rsi, %rax
-; X64-NEXT:    sbbq %r12, %rax
-; X64-NEXT:    setb %r13b
-; X64-NEXT:    cmpq %rdx, %rcx
-; X64-NEXT:    sbbq %rsi, %r12
-; X64-NEXT:    sbbb $0, %r13b
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    cmpq %rdx, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movq %rdi, %rcx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rcx
-; X64-NEXT:    setb %bpl
-; X64-NEXT:    cmpq %rsi, %rdx
-; X64-NEXT:    sbbq %rdi, %rax
-; X64-NEXT:    sbbb $0, %bpl
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    cmpq %rsi, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rdx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rdx
-; X64-NEXT:    setb %r11b
-; X64-NEXT:    cmpq %rdi, %rsi
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    sbbb $0, %r11b
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    cmpq %rdi, %r8
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rsi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rsi
-; X64-NEXT:    setb %sil
-; X64-NEXT:    cmpq %r8, %rdi
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    sbbb $0, %sil
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r8
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    cmpq %r8, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %rdi
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rdi
-; X64-NEXT:    setb %dil
-; X64-NEXT:    cmpq %r9, %r8
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    sbbb $0, %dil
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r9
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT:    cmpq %r9, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r8
-; X64-NEXT:    movq (%rsp), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r8
-; X64-NEXT:    setb %r8b
-; X64-NEXT:    cmpq %r10, %r9
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    sbbb $0, %r8b
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r10
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; X64-NEXT:    cmpq %r10, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    movq %rcx, %r9
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r9
-; X64-NEXT:    setb %r9b
-; X64-NEXT:    cmpq %rbx, %r10
-; X64-NEXT:    sbbq %rcx, %rax
-; X64-NEXT:    sbbb $0, %r9b
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    cmpq %rax, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r10
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    sbbq %rcx, %r10
-; X64-NEXT:    setb %r10b
-; X64-NEXT:    cmpq %rbx, %rax
-; X64-NEXT:    sbbq %rdx, %rcx
-; X64-NEXT:    sbbb $0, %r10b
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    cmpq %rcx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %rbx
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %rbx
-; X64-NEXT:    setb %bl
-; X64-NEXT:    cmpq %r14, %rcx
-; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r14
-; X64-NEXT:    sbbb $0, %bl
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    cmpq %rcx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r15
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
-; X64-NEXT:    sbbq %rax, %r15
-; X64-NEXT:    setb %r15b
-; X64-NEXT:    cmpq %r14, %rcx
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %r12
-; X64-NEXT:    sbbq %rdx, %rax
-; X64-NEXT:    movq {{[0-9]+}}(%rsp), %rax
-; X64-NEXT:    sbbb $0, %r15b
-; X64-NEXT:    cmpq %r12, %rax
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdx # 8-byte Reload
-; X64-NEXT:    movq %rdx, %r14
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rcx # 8-byte Reload
-; X64-NEXT:    sbbq %rcx, %r14
-; X64-NEXT:    setb %r14b
-; X64-NEXT:    cmpq %rax, %r12
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm0
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm1
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm2
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm3
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm4
-; X64-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
-; X64-NEXT:    movd %eax, %xmm5
-; X64-NEXT:    movzbl %r13b, %eax
-; X64-NEXT:    movd %eax, %xmm6
-; X64-NEXT:    movzbl %bpl, %eax
-; X64-NEXT:    movd %eax, %xmm7
-; X64-NEXT:    movzbl %r11b, %eax
-; X64-NEXT:    movd %eax, %xmm8
-; X64-NEXT:    movzbl %sil, %eax
-; X64-NEXT:    movd %eax, %xmm9
-; X64-NEXT:    movzbl %dil, %eax
-; X64-NEXT:    movd %eax, %xmm10
-; X64-NEXT:    movzbl %r8b, %eax
-; X64-NEXT:    movd %eax, %xmm11
-; X64-NEXT:    movzbl %r9b, %eax
-; X64-NEXT:    movd %eax, %xmm12
-; X64-NEXT:    movzbl %r10b, %eax
-; X64-NEXT:    movd %eax, %xmm13
-; X64-NEXT:    movzbl %bl, %eax
-; X64-NEXT:    movd %eax, %xmm14
-; X64-NEXT:    movzbl %r15b, %eax
-; X64-NEXT:    movd %eax, %xmm15
-; X64-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
-; X64-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
-; X64-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
-; X64-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
-; X64-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0]
-; X64-NEXT:    sbbq %rdx, %rcx
-; X64-NEXT:    sbbb $0, %r14b
-; X64-NEXT:    andb $3, %r14b
-; X64-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
-; X64-NEXT:    movb %r14b, 4(%rdi)
-; X64-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    andl $3, %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    leaq (%rcx,%rax,4), %rax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    shll $4, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    andl $3, %eax
-; X64-NEXT:    shll $6, %eax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    shll $8, %ecx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    andl $3, %eax
-; X64-NEXT:    shll $10, %eax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    andl $3, %edx
-; X64-NEXT:    shll $12, %edx
-; X64-NEXT:    orq %rax, %rdx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
-; X64-NEXT:    andl $3, %esi
-; X64-NEXT:    shll $14, %esi
-; X64-NEXT:    orq %rdx, %rsi
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    andl $3, %eax
-; X64-NEXT:    shll $16, %eax
-; X64-NEXT:    orq %rsi, %rax
-; X64-NEXT:    orq %rcx, %rax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    shll $18, %ecx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    andl $3, %edx
-; X64-NEXT:    shll $20, %edx
-; X64-NEXT:    orq %rcx, %rdx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    shll $22, %ecx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    andl $3, %edx
-; X64-NEXT:    shll $24, %edx
-; X64-NEXT:    orq %rcx, %rdx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
-; X64-NEXT:    andl $3, %ecx
-; X64-NEXT:    shlq $26, %rcx
-; X64-NEXT:    orq %rdx, %rcx
-; X64-NEXT:    orq %rax, %rcx
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %eax
-; X64-NEXT:    andl $3, %eax
-; X64-NEXT:    shlq $28, %rax
-; X64-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
-; X64-NEXT:    andl $3, %edx
-; X64-NEXT:    shlq $30, %rdx
-; X64-NEXT:    orq %rax, %rdx
-; X64-NEXT:    orq %rcx, %rdx
-; X64-NEXT:    movq %rdi, %rax
-; X64-NEXT:    movl %edx, (%rdi)
-; X64-NEXT:    addq $120, %rsp
-; X64-NEXT:    popq %rbx
-; X64-NEXT:    popq %r12
-; X64-NEXT:    popq %r13
-; X64-NEXT:    popq %r14
-; X64-NEXT:    popq %r15
-; X64-NEXT:    popq %rbp
-; X64-NEXT:    retq
+; SSE4-LABEL: ucmp_uncommon_vectors:
+; SSE4:       # %bb.0:
+; SSE4-NEXT:    pushq %rbp
+; SSE4-NEXT:    pushq %r15
+; SSE4-NEXT:    pushq %r14
+; SSE4-NEXT:    pushq %r13
+; SSE4-NEXT:    pushq %r12
+; SSE4-NEXT:    pushq %rbx
+; SSE4-NEXT:    subq $120, %rsp
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    andl $127, %edx
+; SSE4-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    andl $127, %r8d
+; SSE4-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; SSE4-NEXT:    andl $127, %r13d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    andl $127, %ebx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    andl $127, %r12d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    andl $127, %edx
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE4-NEXT:    andl $127, %ebp
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE4-NEXT:    andl $127, %r8d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    andl $127, %eax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    andl $127, %r14d
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    cmpq %r15, %r10
+; SSE4-NEXT:    movq %r14, %r11
+; SSE4-NEXT:    sbbq %rax, %r11
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    cmpq %r10, %r15
+; SSE4-NEXT:    sbbq %r14, %rax
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    cmpq %rax, %r10
+; SSE4-NEXT:    movq %r8, %r11
+; SSE4-NEXT:    sbbq %rbp, %r11
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    cmpq %r10, %rax
+; SSE4-NEXT:    sbbq %r8, %rbp
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE4-NEXT:    cmpq %rax, %r8
+; SSE4-NEXT:    movq %rdx, %r10
+; SSE4-NEXT:    sbbq %r12, %r10
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %r8, %rax
+; SSE4-NEXT:    sbbq %rdx, %r12
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq %rbx, %r8
+; SSE4-NEXT:    sbbq %r13, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %rbx, %r13
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    movq %r11, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r11, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    movq %r11, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r11, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    movq %r11, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r11, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    movq %r11, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r11, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE4-NEXT:    cmpq %rax, %rdx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE4-NEXT:    movq %r11, %r8
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r10, %r8
+; SSE4-NEXT:    setb %r8b
+; SSE4-NEXT:    cmpq %rdx, %rax
+; SSE4-NEXT:    sbbq %r11, %r10
+; SSE4-NEXT:    sbbb $0, %r8b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE4-NEXT:    cmpq %rax, %r10
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    movq %rbx, %rdx
+; SSE4-NEXT:    movq (%rsp), %r11 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r11, %rdx
+; SSE4-NEXT:    setb %dl
+; SSE4-NEXT:    cmpq %r10, %rax
+; SSE4-NEXT:    sbbq %rbx, %r11
+; SSE4-NEXT:    sbbb $0, %dl
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE4-NEXT:    cmpq %rax, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    movq %r14, %r10
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE4-NEXT:    sbbq %rbx, %r10
+; SSE4-NEXT:    setb %r10b
+; SSE4-NEXT:    cmpq %r11, %rax
+; SSE4-NEXT:    sbbq %r14, %rbx
+; SSE4-NEXT:    sbbb $0, %r10b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE4-NEXT:    cmpq %rax, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    movq %r15, %r11
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %r11
+; SSE4-NEXT:    setb %r11b
+; SSE4-NEXT:    cmpq %rbx, %rax
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    sbbb $0, %r11b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE4-NEXT:    cmpq %rax, %r12
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    movq %r15, %rbx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %rbx
+; SSE4-NEXT:    setb %bl
+; SSE4-NEXT:    cmpq %r12, %rax
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    sbbb $0, %bl
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    cmpq %r9, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    movq %r15, %r12
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %r12
+; SSE4-NEXT:    setb %r12b
+; SSE4-NEXT:    cmpq %rax, %r9
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    sbbb $0, %r12b
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    cmpq %rcx, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    movq %r15, %r9
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %r9
+; SSE4-NEXT:    setb %r9b
+; SSE4-NEXT:    cmpq %rax, %rcx
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE4-NEXT:    sbbb $0, %r9b
+; SSE4-NEXT:    cmpq %rsi, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    movq %r15, %rcx
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r14, %rcx
+; SSE4-NEXT:    setb %r13b
+; SSE4-NEXT:    cmpq %rax, %rsi
+; SSE4-NEXT:    sbbq %r15, %r14
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; SSE4-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE4-NEXT:    sbbb $0, %r13b
+; SSE4-NEXT:    cmpq %rsi, %r14
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbp # 8-byte Reload
+; SSE4-NEXT:    movq %rbp, %rax
+; SSE4-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE4-NEXT:    sbbq %r15, %rax
+; SSE4-NEXT:    movq %rdi, %rax
+; SSE4-NEXT:    setb %cl
+; SSE4-NEXT:    cmpq %r14, %rsi
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm0
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm1
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm2
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm3
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm4
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm5
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm6
+; SSE4-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE4-NEXT:    movd %esi, %xmm7
+; SSE4-NEXT:    movzbl %r8b, %esi
+; SSE4-NEXT:    movd %esi, %xmm8
+; SSE4-NEXT:    movzbl %dl, %edx
+; SSE4-NEXT:    movd %edx, %xmm9
+; SSE4-NEXT:    movzbl %r10b, %edx
+; SSE4-NEXT:    movd %edx, %xmm10
+; SSE4-NEXT:    movzbl %r11b, %edx
+; SSE4-NEXT:    movd %edx, %xmm11
+; SSE4-NEXT:    movzbl %bl, %edx
+; SSE4-NEXT:    movd %edx, %xmm12
+; SSE4-NEXT:    movzbl %r12b, %edx
+; SSE4-NEXT:    movd %edx, %xmm13
+; SSE4-NEXT:    movzbl %r9b, %edx
+; SSE4-NEXT:    movd %edx, %xmm14
+; SSE4-NEXT:    movzbl %r13b, %edx
+; SSE4-NEXT:    movd %edx, %xmm15
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1],xmm1[2],xmm0[2],xmm1[3],xmm0[3],xmm1[4],xmm0[4],xmm1[5],xmm0[5],xmm1[6],xmm0[6],xmm1[7],xmm0[7]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm3 = xmm3[0],xmm2[0],xmm3[1],xmm2[1],xmm3[2],xmm2[2],xmm3[3],xmm2[3],xmm3[4],xmm2[4],xmm3[5],xmm2[5],xmm3[6],xmm2[6],xmm3[7],xmm2[7]
+; SSE4-NEXT:    punpcklwd {{.*#+}} xmm3 = xmm3[0],xmm1[0],xmm3[1],xmm1[1],xmm3[2],xmm1[2],xmm3[3],xmm1[3]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm5 = xmm5[0],xmm4[0],xmm5[1],xmm4[1],xmm5[2],xmm4[2],xmm5[3],xmm4[3],xmm5[4],xmm4[4],xmm5[5],xmm4[5],xmm5[6],xmm4[6],xmm5[7],xmm4[7]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm7 = xmm7[0],xmm6[0],xmm7[1],xmm6[1],xmm7[2],xmm6[2],xmm7[3],xmm6[3],xmm7[4],xmm6[4],xmm7[5],xmm6[5],xmm7[6],xmm6[6],xmm7[7],xmm6[7]
+; SSE4-NEXT:    punpcklwd {{.*#+}} xmm7 = xmm7[0],xmm5[0],xmm7[1],xmm5[1],xmm7[2],xmm5[2],xmm7[3],xmm5[3]
+; SSE4-NEXT:    punpckldq {{.*#+}} xmm7 = xmm7[0],xmm3[0],xmm7[1],xmm3[1]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm9 = xmm9[0],xmm8[0],xmm9[1],xmm8[1],xmm9[2],xmm8[2],xmm9[3],xmm8[3],xmm9[4],xmm8[4],xmm9[5],xmm8[5],xmm9[6],xmm8[6],xmm9[7],xmm8[7]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm11 = xmm11[0],xmm10[0],xmm11[1],xmm10[1],xmm11[2],xmm10[2],xmm11[3],xmm10[3],xmm11[4],xmm10[4],xmm11[5],xmm10[5],xmm11[6],xmm10[6],xmm11[7],xmm10[7]
+; SSE4-NEXT:    punpcklwd {{.*#+}} xmm11 = xmm11[0],xmm9[0],xmm11[1],xmm9[1],xmm11[2],xmm9[2],xmm11[3],xmm9[3]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm13 = xmm13[0],xmm12[0],xmm13[1],xmm12[1],xmm13[2],xmm12[2],xmm13[3],xmm12[3],xmm13[4],xmm12[4],xmm13[5],xmm12[5],xmm13[6],xmm12[6],xmm13[7],xmm12[7]
+; SSE4-NEXT:    punpcklbw {{.*#+}} xmm15 = xmm15[0],xmm14[0],xmm15[1],xmm14[1],xmm15[2],xmm14[2],xmm15[3],xmm14[3],xmm15[4],xmm14[4],xmm15[5],xmm14[5],xmm15[6],xmm14[6],xmm15[7],xmm14[7]
+; SSE4-NEXT:    punpcklwd {{.*#+}} xmm15 = xmm15[0],xmm13[0],xmm15[1],xmm13[1],xmm15[2],xmm13[2],xmm15[3],xmm13[3]
+; SSE4-NEXT:    punpckldq {{.*#+}} xmm15 = xmm15[0],xmm11[0],xmm15[1],xmm11[1]
+; SSE4-NEXT:    punpcklqdq {{.*#+}} xmm15 = xmm15[0],xmm7[0]
+; SSE4-NEXT:    sbbq %rbp, %r15
+; SSE4-NEXT:    sbbb $0, %cl
+; SSE4-NEXT:    andb $3, %cl
+; SSE4-NEXT:    movb %cl, 4(%rdi)
+; SSE4-NEXT:    movdqa %xmm15, -{{[0-9]+}}(%rsp)
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    leaq (%rdx,%rcx,4), %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    shll $4, %edx
+; SSE4-NEXT:    orq %rcx, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $6, %ecx
+; SSE4-NEXT:    orq %rdx, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    shll $8, %edx
+; SSE4-NEXT:    orq %rcx, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $10, %ecx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shll $12, %esi
+; SSE4-NEXT:    orq %rcx, %rsi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edi
+; SSE4-NEXT:    andl $3, %edi
+; SSE4-NEXT:    shll $14, %edi
+; SSE4-NEXT:    orq %rsi, %rdi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shll $16, %ecx
+; SSE4-NEXT:    orq %rdi, %rcx
+; SSE4-NEXT:    orq %rdx, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    shll $18, %edx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shll $20, %esi
+; SSE4-NEXT:    orq %rdx, %rsi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    shll $22, %edx
+; SSE4-NEXT:    orq %rsi, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shll $24, %esi
+; SSE4-NEXT:    orq %rdx, %rsi
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %edx
+; SSE4-NEXT:    andl $3, %edx
+; SSE4-NEXT:    shlq $26, %rdx
+; SSE4-NEXT:    orq %rsi, %rdx
+; SSE4-NEXT:    orq %rcx, %rdx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %ecx
+; SSE4-NEXT:    andl $3, %ecx
+; SSE4-NEXT:    shlq $28, %rcx
+; SSE4-NEXT:    movzbl -{{[0-9]+}}(%rsp), %esi
+; SSE4-NEXT:    andl $3, %esi
+; SSE4-NEXT:    shlq $30, %rsi
+; SSE4-NEXT:    orq %rcx, %rsi
+; SSE4-NEXT:    orq %rdx, %rsi
+; SSE4-NEXT:    movl %esi, (%rax)
+; SSE4-NEXT:    addq $120, %rsp
+; SSE4-NEXT:    popq %rbx
+; SSE4-NEXT:    popq %r12
+; SSE4-NEXT:    popq %r13
+; SSE4-NEXT:    popq %r14
+; SSE4-NEXT:    popq %r15
+; SSE4-NEXT:    popq %rbp
+; SSE4-NEXT:    retq
+;
+; SSE2-LABEL: ucmp_uncommon_vectors:
+; SSE2:       # %bb.0:
+; SSE2-NEXT:    pushq %rbp
+; SSE2-NEXT:    pushq %r15
+; SSE2-NEXT:    pushq %r14
+; SSE2-NEXT:    pushq %r13
+; SSE2-NEXT:    pushq %r12
+; SSE2-NEXT:    pushq %rbx
+; SSE2-NEXT:    subq $88, %rsp
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    andl $127, %r8d
+; SSE2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    andl $127, %edx
+; SSE2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    andl $127, %eax
+; SSE2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; SSE2-NEXT:    andl $127, %r13d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT:    andl $127, %ebx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    andl $127, %edx
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT:    andl $127, %r8d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; SSE2-NEXT:    andl $127, %r12d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE2-NEXT:    andl $127, %r14d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; SSE2-NEXT:    andl $127, %ebp
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT:    andl $127, %r15d
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    cmpq %rax, %r11
+; SSE2-NEXT:    movq %r15, %r10
+; SSE2-NEXT:    sbbq %rbp, %r10
+; SSE2-NEXT:    setb %r10b
+; SSE2-NEXT:    cmpq %r11, %rax
+; SSE2-NEXT:    sbbq %r15, %rbp
+; SSE2-NEXT:    sbbb $0, %r10b
+; SSE2-NEXT:    movb %r10b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %rax, %r10
+; SSE2-NEXT:    movq %r14, %r11
+; SSE2-NEXT:    sbbq %r12, %r11
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    cmpq %r10, %rax
+; SSE2-NEXT:    sbbq %r14, %r12
+; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %rax, %r10
+; SSE2-NEXT:    movq %r8, %r11
+; SSE2-NEXT:    sbbq %rdx, %r11
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    cmpq %r10, %rax
+; SSE2-NEXT:    sbbq %r8, %rdx
+; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq %rbx, %r8
+; SSE2-NEXT:    sbbq %r13, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %rbx, %r13
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; SSE2-NEXT:    cmpq %rax, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %r8
+; SSE2-NEXT:    setb %bpl
+; SSE2-NEXT:    cmpq %rdx, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %bpl
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; SSE2-NEXT:    cmpq %rax, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    movq %r11, %rdx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r10, %rdx
+; SSE2-NEXT:    setb %dl
+; SSE2-NEXT:    cmpq %r8, %rax
+; SSE2-NEXT:    sbbq %r11, %r10
+; SSE2-NEXT:    sbbb $0, %dl
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; SSE2-NEXT:    cmpq %rax, %r10
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    movq %rbx, %r8
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r11, %r8
+; SSE2-NEXT:    setb %r8b
+; SSE2-NEXT:    cmpq %r10, %rax
+; SSE2-NEXT:    sbbq %rbx, %r11
+; SSE2-NEXT:    sbbb $0, %r8b
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; SSE2-NEXT:    cmpq %rax, %r11
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; SSE2-NEXT:    movq %r14, %r10
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; SSE2-NEXT:    sbbq %rbx, %r10
+; SSE2-NEXT:    setb %r10b
+; SSE2-NEXT:    cmpq %r11, %rax
+; SSE2-NEXT:    sbbq %r14, %rbx
+; SSE2-NEXT:    sbbb $0, %r10b
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; SSE2-NEXT:    cmpq %rax, %rbx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    movq %r15, %r11
+; SSE2-NEXT:    movq (%rsp), %r14 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r14, %r11
+; SSE2-NEXT:    setb %r11b
+; SSE2-NEXT:    cmpq %rbx, %rax
+; SSE2-NEXT:    sbbq %r15, %r14
+; SSE2-NEXT:    sbbb $0, %r11b
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; SSE2-NEXT:    cmpq %rax, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE2-NEXT:    movq %r12, %rbx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r15, %rbx
+; SSE2-NEXT:    setb %bl
+; SSE2-NEXT:    cmpq %r14, %rax
+; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    sbbb $0, %bl
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    cmpq %r9, %rax
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE2-NEXT:    movq %r12, %r14
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r15, %r14
+; SSE2-NEXT:    setb %r14b
+; SSE2-NEXT:    cmpq %rax, %r9
+; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    sbbb $0, %r14b
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    cmpq %rsi, %rax
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE2-NEXT:    movq %r12, %r9
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r15, %r9
+; SSE2-NEXT:    setb %r9b
+; SSE2-NEXT:    cmpq %rax, %rsi
+; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    sbbb $0, %r9b
+; SSE2-NEXT:    cmpq %rcx, %rax
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE2-NEXT:    movq %r12, %rsi
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r15, %rsi
+; SSE2-NEXT:    setb %sil
+; SSE2-NEXT:    cmpq %rax, %rcx
+; SSE2-NEXT:    sbbq %r12, %r15
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; SSE2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; SSE2-NEXT:    sbbb $0, %sil
+; SSE2-NEXT:    cmpq %rax, %r15
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; SSE2-NEXT:    movq %r13, %rcx
+; SSE2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; SSE2-NEXT:    sbbq %r12, %rcx
+; SSE2-NEXT:    setb %cl
+; SSE2-NEXT:    cmpq %r15, %rax
+; SSE2-NEXT:    sbbq %r13, %r12
+; SSE2-NEXT:    movq %rdi, %rax
+; SSE2-NEXT:    movzbl %sil, %esi
+; SSE2-NEXT:    sbbb $0, %cl
+; SSE2-NEXT:    movzbl %r9b, %edi
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    leaq (%rdi,%rsi,4), %rsi
+; SSE2-NEXT:    movzbl %r14b, %edi
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $4, %edi
+; SSE2-NEXT:    orq %rsi, %rdi
+; SSE2-NEXT:    movzbl %bl, %r9d
+; SSE2-NEXT:    andl $3, %r9d
+; SSE2-NEXT:    shll $6, %r9d
+; SSE2-NEXT:    orq %rdi, %r9
+; SSE2-NEXT:    movzbl %r11b, %esi
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shll $8, %esi
+; SSE2-NEXT:    orq %r9, %rsi
+; SSE2-NEXT:    movzbl %r8b, %edi
+; SSE2-NEXT:    movzbl %r10b, %r8d
+; SSE2-NEXT:    andl $3, %r8d
+; SSE2-NEXT:    shll $10, %r8d
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $12, %edi
+; SSE2-NEXT:    orq %r8, %rdi
+; SSE2-NEXT:    movzbl %dl, %r8d
+; SSE2-NEXT:    andl $3, %r8d
+; SSE2-NEXT:    shll $14, %r8d
+; SSE2-NEXT:    orq %rdi, %r8
+; SSE2-NEXT:    movzbl %bpl, %edx
+; SSE2-NEXT:    andl $3, %edx
+; SSE2-NEXT:    shll $16, %edx
+; SSE2-NEXT:    orq %r8, %rdx
+; SSE2-NEXT:    orq %rsi, %rdx
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $18, %edi
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shll $20, %esi
+; SSE2-NEXT:    orq %rdi, %rsi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shll $22, %edi
+; SSE2-NEXT:    orq %rsi, %rdi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shll $24, %esi
+; SSE2-NEXT:    orq %rdi, %rsi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %edi
+; SSE2-NEXT:    shlq $26, %rdi
+; SSE2-NEXT:    orq %rsi, %rdi
+; SSE2-NEXT:    orq %rdx, %rdi
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; SSE2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; SSE2-NEXT:    andl $3, %esi
+; SSE2-NEXT:    shlq $28, %rsi
+; SSE2-NEXT:    andl $3, %edx
+; SSE2-NEXT:    shlq $30, %rdx
+; SSE2-NEXT:    orq %rsi, %rdx
+; SSE2-NEXT:    orq %rdi, %rdx
+; SSE2-NEXT:    andb $3, %cl
+; SSE2-NEXT:    movb %cl, 4(%rax)
+; SSE2-NEXT:    movl %edx, (%rax)
+; SSE2-NEXT:    addq $88, %rsp
+; SSE2-NEXT:    popq %rbx
+; SSE2-NEXT:    popq %r12
+; SSE2-NEXT:    popq %r13
+; SSE2-NEXT:    popq %r14
+; SSE2-NEXT:    popq %r15
+; SSE2-NEXT:    popq %rbp
+; SSE2-NEXT:    retq
+;
+; AVX2-LABEL: ucmp_uncommon_vectors:
+; AVX2:       # %bb.0:
+; AVX2-NEXT:    pushq %rbp
+; AVX2-NEXT:    pushq %r15
+; AVX2-NEXT:    pushq %r14
+; AVX2-NEXT:    pushq %r13
+; AVX2-NEXT:    pushq %r12
+; AVX2-NEXT:    pushq %rbx
+; AVX2-NEXT:    subq $88, %rsp
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    andl $127, %r8d
+; AVX2-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    andl $127, %edx
+; AVX2-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX2-NEXT:    andl $127, %r10d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    andl $127, %r15d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    andl $127, %edx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX2-NEXT:    andl $127, %r13d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    andl $127, %eax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    andl $127, %r14d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX2-NEXT:    andl $127, %r12d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    andl $127, %r8d
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmpq %rbp, %rbx
+; AVX2-NEXT:    movq %r8, %r11
+; AVX2-NEXT:    sbbq %r12, %r11
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    cmpq %rbx, %rbp
+; AVX2-NEXT:    sbbq %r8, %r12
+; AVX2-NEXT:    sbbb $0, %r11b
+; AVX2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    cmpq %r8, %r11
+; AVX2-NEXT:    movq %r14, %rbx
+; AVX2-NEXT:    sbbq %rax, %rbx
+; AVX2-NEXT:    setb %bl
+; AVX2-NEXT:    cmpq %r11, %r8
+; AVX2-NEXT:    sbbq %r14, %rax
+; AVX2-NEXT:    sbbb $0, %bl
+; AVX2-NEXT:    movb %bl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    cmpq %rax, %r8
+; AVX2-NEXT:    movq %r13, %r11
+; AVX2-NEXT:    sbbq %rdx, %r11
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    cmpq %r8, %rax
+; AVX2-NEXT:    sbbq %r13, %rdx
+; AVX2-NEXT:    sbbb $0, %r11b
+; AVX2-NEXT:    movb %r11b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq %r15, %r8
+; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r15, %r10
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movb %r8b, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX2-NEXT:    cmpq %rax, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r10, %r8
+; AVX2-NEXT:    setb %bpl
+; AVX2-NEXT:    cmpq %rdx, %rax
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbb $0, %bpl
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    cmpq %rax, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    movq %r11, %rdx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r10, %rdx
+; AVX2-NEXT:    setb %dl
+; AVX2-NEXT:    cmpq %r8, %rax
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    sbbb $0, %dl
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX2-NEXT:    cmpq %rax, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    movq %rbx, %r10
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r11, %r10
+; AVX2-NEXT:    setb %r10b
+; AVX2-NEXT:    cmpq %r8, %rax
+; AVX2-NEXT:    sbbq %rbx, %r11
+; AVX2-NEXT:    sbbb $0, %r10b
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX2-NEXT:    cmpq %rax, %r11
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX2-NEXT:    movq %r14, %r8
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX2-NEXT:    sbbq %rbx, %r8
+; AVX2-NEXT:    setb %r8b
+; AVX2-NEXT:    cmpq %r11, %rax
+; AVX2-NEXT:    sbbq %r14, %rbx
+; AVX2-NEXT:    sbbb $0, %r8b
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX2-NEXT:    cmpq %rax, %rbx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    movq %r15, %r11
+; AVX2-NEXT:    movq (%rsp), %r14 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r14, %r11
+; AVX2-NEXT:    setb %r11b
+; AVX2-NEXT:    cmpq %rbx, %rax
+; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    sbbb $0, %r11b
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX2-NEXT:    cmpq %rax, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %rbx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r15, %rbx
+; AVX2-NEXT:    setb %bl
+; AVX2-NEXT:    cmpq %r14, %rax
+; AVX2-NEXT:    sbbq %r12, %r15
+; AVX2-NEXT:    sbbb $0, %bl
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    cmpq %r9, %rax
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %r14
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r15, %r14
+; AVX2-NEXT:    setb %r14b
+; AVX2-NEXT:    cmpq %rax, %r9
+; AVX2-NEXT:    sbbq %r12, %r15
+; AVX2-NEXT:    sbbb $0, %r14b
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX2-NEXT:    cmpq %rsi, %rax
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX2-NEXT:    movq %r12, %r9
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r15, %r9
+; AVX2-NEXT:    setb %r9b
+; AVX2-NEXT:    cmpq %rax, %rsi
+; AVX2-NEXT:    sbbq %r12, %r15
+; AVX2-NEXT:    movq %rdi, %r12
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT:    sbbb $0, %r9b
+; AVX2-NEXT:    cmpq %rcx, %rsi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    movq %rax, %rdi
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r15, %rdi
+; AVX2-NEXT:    setb %dil
+; AVX2-NEXT:    cmpq %rsi, %rcx
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX2-NEXT:    sbbq %rax, %r15
+; AVX2-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX2-NEXT:    sbbb $0, %dil
+; AVX2-NEXT:    cmpq %rsi, %r15
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX2-NEXT:    movq %rax, %rcx
+; AVX2-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r13 # 8-byte Reload
+; AVX2-NEXT:    sbbq %r13, %rcx
+; AVX2-NEXT:    setb %cl
+; AVX2-NEXT:    cmpq %r15, %rsi
+; AVX2-NEXT:    sbbq %rax, %r13
+; AVX2-NEXT:    sbbb $0, %cl
+; AVX2-NEXT:    movzbl %dil, %esi
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    movzbl %r9b, %edi
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    leaq (%rdi,%rsi,4), %rsi
+; AVX2-NEXT:    movzbl %r14b, %edi
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shll $4, %edi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    movzbl %bl, %r9d
+; AVX2-NEXT:    andl $3, %r9d
+; AVX2-NEXT:    shll $6, %r9d
+; AVX2-NEXT:    orq %rdi, %r9
+; AVX2-NEXT:    movzbl %r11b, %esi
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    shll $8, %esi
+; AVX2-NEXT:    orq %r9, %rsi
+; AVX2-NEXT:    movzbl %r8b, %edi
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shll $10, %edi
+; AVX2-NEXT:    movzbl %r10b, %r8d
+; AVX2-NEXT:    andl $3, %r8d
+; AVX2-NEXT:    shll $12, %r8d
+; AVX2-NEXT:    orq %rdi, %r8
+; AVX2-NEXT:    movzbl %dl, %edi
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shll $14, %edi
+; AVX2-NEXT:    orq %r8, %rdi
+; AVX2-NEXT:    movzbl %bpl, %edx
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    shll $16, %edx
+; AVX2-NEXT:    orq %rdi, %rdx
+; AVX2-NEXT:    orq %rsi, %rdx
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    shll $18, %esi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shll $20, %edi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    shll $22, %esi
+; AVX2-NEXT:    orq %rdi, %rsi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shll $24, %edi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %esi
+; AVX2-NEXT:    shlq $26, %rsi
+; AVX2-NEXT:    orq %rdi, %rsi
+; AVX2-NEXT:    orq %rdx, %rsi
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %edx
+; AVX2-NEXT:    shlq $28, %rdx
+; AVX2-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edi # 1-byte Folded Reload
+; AVX2-NEXT:    andl $3, %edi
+; AVX2-NEXT:    shlq $30, %rdi
+; AVX2-NEXT:    orq %rdx, %rdi
+; AVX2-NEXT:    orq %rsi, %rdi
+; AVX2-NEXT:    andb $3, %cl
+; AVX2-NEXT:    movb %cl, 4(%r12)
+; AVX2-NEXT:    movl %edi, (%r12)
+; AVX2-NEXT:    movq %r12, %rax
+; AVX2-NEXT:    addq $88, %rsp
+; AVX2-NEXT:    popq %rbx
+; AVX2-NEXT:    popq %r12
+; AVX2-NEXT:    popq %r13
+; AVX2-NEXT:    popq %r14
+; AVX2-NEXT:    popq %r15
+; AVX2-NEXT:    popq %rbp
+; AVX2-NEXT:    retq
+;
+; AVX512-LABEL: ucmp_uncommon_vectors:
+; AVX512:       # %bb.0:
+; AVX512-NEXT:    pushq %rbp
+; AVX512-NEXT:    pushq %r15
+; AVX512-NEXT:    pushq %r14
+; AVX512-NEXT:    pushq %r13
+; AVX512-NEXT:    pushq %r12
+; AVX512-NEXT:    pushq %rbx
+; AVX512-NEXT:    subq $88, %rsp
+; AVX512-NEXT:    movq %r9, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rcx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rsi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq %rdi, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    andl $127, %r8d
+; AVX512-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    andl $127, %edx
+; AVX512-NEXT:    movq %rdx, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, (%rsp) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq %rax, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbp
+; AVX512-NEXT:    andl $127, %ebp
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r12
+; AVX512-NEXT:    andl $127, %r12d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r13
+; AVX512-NEXT:    andl $127, %r13d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX512-NEXT:    andl $127, %r15d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    andl $127, %r10d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rbx
+; AVX512-NEXT:    andl $127, %ebx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT:    andl $127, %r8d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r9
+; AVX512-NEXT:    andl $127, %r9d
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT:    andl $127, %esi
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT:    andl $127, %edi
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    andl $127, %eax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    andl $127, %edx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r11
+; AVX512-NEXT:    cmpq %r14, %r11
+; AVX512-NEXT:    movq %rdx, %rcx
+; AVX512-NEXT:    sbbq %rax, %rcx
+; AVX512-NEXT:    setb %cl
+; AVX512-NEXT:    cmpq %r11, %r14
+; AVX512-NEXT:    sbbq %rdx, %rax
+; AVX512-NEXT:    sbbb $0, %cl
+; AVX512-NEXT:    movb %cl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %rdi, %rdx
+; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %r9, %rdx
+; AVX512-NEXT:    sbbq %r8, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %r9, %r8
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %rbx, %rdx
+; AVX512-NEXT:    sbbq %r10, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rbx, %r10
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %r15, %rdx
+; AVX512-NEXT:    sbbq %r13, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %r15, %r13
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq %r12, %rdx
+; AVX512-NEXT:    sbbq %rbp, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %r12, %rbp
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movb %dl, {{[-0-9]+}}(%r{{[sb]}}p) # 1-byte Spill
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    movq %rdi, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    setb %r13b
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    sbbb $0, %r13b
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    movq %rdi, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    sbbq %rsi, %rdx
+; AVX512-NEXT:    setb %bpl
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    sbbb $0, %bpl
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdx
+; AVX512-NEXT:    cmpq %rcx, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    movq %rdi, %rax
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rsi # 8-byte Reload
+; AVX512-NEXT:    sbbq %rsi, %rax
+; AVX512-NEXT:    setb %r9b
+; AVX512-NEXT:    cmpq %rdx, %rcx
+; AVX512-NEXT:    sbbq %rdi, %rsi
+; AVX512-NEXT:    sbbb $0, %r9b
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rsi
+; AVX512-NEXT:    cmpq %rcx, %rsi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rdi # 8-byte Reload
+; AVX512-NEXT:    movq %rdi, %rdx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %rdx
+; AVX512-NEXT:    setb %dl
+; AVX512-NEXT:    cmpq %rsi, %rcx
+; AVX512-NEXT:    sbbq %rdi, %rax
+; AVX512-NEXT:    sbbb $0, %dl
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rdi
+; AVX512-NEXT:    cmpq %rcx, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r8 # 8-byte Reload
+; AVX512-NEXT:    movq %r8, %rsi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %rsi
+; AVX512-NEXT:    setb %sil
+; AVX512-NEXT:    cmpq %rdi, %rcx
+; AVX512-NEXT:    sbbq %r8, %rax
+; AVX512-NEXT:    sbbb $0, %sil
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r8
+; AVX512-NEXT:    cmpq %rcx, %r8
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r10 # 8-byte Reload
+; AVX512-NEXT:    movq %r10, %rdi
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %rdi
+; AVX512-NEXT:    setb %dil
+; AVX512-NEXT:    cmpq %r8, %rcx
+; AVX512-NEXT:    sbbq %r10, %rax
+; AVX512-NEXT:    sbbb $0, %dil
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r10
+; AVX512-NEXT:    cmpq %rcx, %r10
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r11 # 8-byte Reload
+; AVX512-NEXT:    movq %r11, %r8
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %r8
+; AVX512-NEXT:    setb %r8b
+; AVX512-NEXT:    cmpq %r10, %rcx
+; AVX512-NEXT:    sbbq %r11, %rax
+; AVX512-NEXT:    sbbb $0, %r8b
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    cmpq %rax, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    movq %rbx, %r10
+; AVX512-NEXT:    movq (%rsp), %r11 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r11, %r10
+; AVX512-NEXT:    setb %r10b
+; AVX512-NEXT:    cmpq %rcx, %rax
+; AVX512-NEXT:    sbbq %rbx, %r11
+; AVX512-NEXT:    sbbb $0, %r10b
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rbx # 8-byte Reload
+; AVX512-NEXT:    cmpq %rbx, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    movq %r14, %r11
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    sbbq %rax, %r11
+; AVX512-NEXT:    setb %r11b
+; AVX512-NEXT:    cmpq %rcx, %rbx
+; AVX512-NEXT:    sbbq %r14, %rax
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %rcx
+; AVX512-NEXT:    sbbb $0, %r11b
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r14 # 8-byte Reload
+; AVX512-NEXT:    cmpq %r14, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    movq %rax, %rbx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r15 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r15, %rbx
+; AVX512-NEXT:    setb %bl
+; AVX512-NEXT:    cmpq %rcx, %r14
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r14
+; AVX512-NEXT:    sbbq %rax, %r15
+; AVX512-NEXT:    movq {{[0-9]+}}(%rsp), %r15
+; AVX512-NEXT:    sbbb $0, %bl
+; AVX512-NEXT:    cmpq %r14, %r15
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    movq %rax, %rcx
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %r12 # 8-byte Reload
+; AVX512-NEXT:    sbbq %r12, %rcx
+; AVX512-NEXT:    setb %cl
+; AVX512-NEXT:    cmpq %r15, %r14
+; AVX512-NEXT:    sbbq %rax, %r12
+; AVX512-NEXT:    sbbb $0, %cl
+; AVX512-NEXT:    movzbl %bl, %ebx
+; AVX512-NEXT:    andl $3, %ebx
+; AVX512-NEXT:    movzbl %r11b, %r11d
+; AVX512-NEXT:    andl $3, %r11d
+; AVX512-NEXT:    leaq (%r11,%rbx,4), %r11
+; AVX512-NEXT:    movzbl %r10b, %r10d
+; AVX512-NEXT:    andl $3, %r10d
+; AVX512-NEXT:    shll $4, %r10d
+; AVX512-NEXT:    orq %r11, %r10
+; AVX512-NEXT:    movzbl %r8b, %r8d
+; AVX512-NEXT:    andl $3, %r8d
+; AVX512-NEXT:    shll $6, %r8d
+; AVX512-NEXT:    orq %r10, %r8
+; AVX512-NEXT:    movzbl %dil, %edi
+; AVX512-NEXT:    andl $3, %edi
+; AVX512-NEXT:    shll $8, %edi
+; AVX512-NEXT:    orq %r8, %rdi
+; AVX512-NEXT:    movzbl %sil, %esi
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shll $10, %esi
+; AVX512-NEXT:    movzbl %dl, %edx
+; AVX512-NEXT:    andl $3, %edx
+; AVX512-NEXT:    shll $12, %edx
+; AVX512-NEXT:    orq %rsi, %rdx
+; AVX512-NEXT:    movzbl %r9b, %esi
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shll $14, %esi
+; AVX512-NEXT:    orq %rdx, %rsi
+; AVX512-NEXT:    movzbl %bpl, %eax
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    shll $16, %eax
+; AVX512-NEXT:    orq %rsi, %rax
+; AVX512-NEXT:    orq %rdi, %rax
+; AVX512-NEXT:    movzbl %r13b, %edx
+; AVX512-NEXT:    andl $3, %edx
+; AVX512-NEXT:    shll $18, %edx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shll $20, %esi
+; AVX512-NEXT:    orq %rdx, %rsi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %edx
+; AVX512-NEXT:    shll $22, %edx
+; AVX512-NEXT:    orq %rsi, %rdx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shll $24, %esi
+; AVX512-NEXT:    orq %rdx, %rsi
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %edx # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %edx
+; AVX512-NEXT:    shlq $26, %rdx
+; AVX512-NEXT:    orq %rsi, %rdx
+; AVX512-NEXT:    orq %rax, %rdx
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %eax # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %eax
+; AVX512-NEXT:    shlq $28, %rax
+; AVX512-NEXT:    movzbl {{[-0-9]+}}(%r{{[sb]}}p), %esi # 1-byte Folded Reload
+; AVX512-NEXT:    andl $3, %esi
+; AVX512-NEXT:    shlq $30, %rsi
+; AVX512-NEXT:    orq %rax, %rsi
+; AVX512-NEXT:    orq %rdx, %rsi
+; AVX512-NEXT:    andb $3, %cl
+; AVX512-NEXT:    movq {{[-0-9]+}}(%r{{[sb]}}p), %rax # 8-byte Reload
+; AVX512-NEXT:    movb %cl, 4(%rax)
+; AVX512-NEXT:    movl %esi, (%rax)
+; AVX512-NEXT:    addq $88, %rsp
+; AVX512-NEXT:    popq %rbx
+; AVX512-NEXT:    popq %r12
+; AVX512-NEXT:    popq %r13
+; AVX512-NEXT:    popq %r14
+; AVX512-NEXT:    popq %r15
+; AVX512-NEXT:    popq %rbp
+; AVX512-NEXT:    retq
 ;
 ; X86-LABEL: ucmp_uncommon_vectors:
 ; X86:       # %bb.0:
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
index 277525796824bd..27d89eaa96df0a 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-by-byte-multiple-legalization.ll
@@ -12071,7 +12071,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
-; FALLBACK0-NEXT:    andl $56, %eax
 ; FALLBACK0-NEXT:    andl $56, %edi
 ; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
 ; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
@@ -12079,6 +12078,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movl %eax, %ecx
 ; FALLBACK0-NEXT:    shrq %cl, %r11
 ; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    andb $56, %sil
 ; FALLBACK0-NEXT:    notb %sil
 ; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
 ; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
@@ -12173,7 +12173,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK1-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK1-NEXT:    andl $56, %ecx
 ; FALLBACK1-NEXT:    andl $56, %eax
 ; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
 ; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
@@ -12240,54 +12239,54 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK2-NEXT:    andl $56, %ecx
 ; FALLBACK2-NEXT:    andl $56, %eax
 ; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
 ; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
 ; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r11
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbx
 ; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
 ; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT:    shrxq %rcx, %r11, %rbp
 ; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    andb $56, %r12b
 ; FALLBACK2-NEXT:    notb %r12b
-; FALLBACK2-NEXT:    addq %r9, %r9
-; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
-; FALLBACK2-NEXT:    orq %rbx, %r9
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
 ; FALLBACK2-NEXT:    orq %r13, %rdi
-; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r13
 ; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK2-NEXT:    shrxq %rcx, %rax, %rcx
-; FALLBACK2-NEXT:    addq %r10, %r10
-; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT:    orq %r8, %r10
+; FALLBACK2-NEXT:    addq %r11, %r11
+; FALLBACK2-NEXT:    shlxq %r12, %r11, %r11
+; FALLBACK2-NEXT:    orq %r8, %r11
 ; FALLBACK2-NEXT:    addq %rsi, %rsi
 ; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT:    orq %rbx, %rsi
+; FALLBACK2-NEXT:    leaq (%r9,%r9), %r8
 ; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
 ; FALLBACK2-NEXT:    orq %r15, %r8
 ; FALLBACK2-NEXT:    addq %r14, %r14
-; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT:    orq %rbp, %r11
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r9
+; FALLBACK2-NEXT:    orq %rbp, %r9
 ; FALLBACK2-NEXT:    addq %rax, %rax
 ; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
 ; FALLBACK2-NEXT:    orq %r13, %rax
 ; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 32(%rdx)
 ; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
 ; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rdi, (%rdx)
-; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
@@ -12325,7 +12324,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK3-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK3-NEXT:    andl $56, %ecx
 ; FALLBACK3-NEXT:    andl $56, %eax
 ; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
 ; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
@@ -12344,17 +12342,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
 ; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
-; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
-; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    shrxq %rcx, %rax, %rax
 ; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
 ; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
 ; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK3-NEXT:    movq %rsi, (%rdx)
 ; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
-; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK3-NEXT:    popq %rbx
 ; FALLBACK3-NEXT:    popq %r14
 ; FALLBACK3-NEXT:    popq %r15
@@ -12384,13 +12381,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    leal (,%r8,8), %eax
-; FALLBACK4-NEXT:    andl $56, %eax
 ; FALLBACK4-NEXT:    andl $56, %r8d
 ; FALLBACK4-NEXT:    movq -128(%rsp,%r8), %r10
 ; FALLBACK4-NEXT:    movq -120(%rsp,%r8), %r9
 ; FALLBACK4-NEXT:    movl %eax, %ecx
 ; FALLBACK4-NEXT:    shrq %cl, %r10
 ; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    andb $56, %sil
 ; FALLBACK4-NEXT:    notb %sil
 ; FALLBACK4-NEXT:    leaq (%r9,%r9), %rdi
 ; FALLBACK4-NEXT:    movl %esi, %ecx
@@ -12480,7 +12477,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK5-NEXT:    andl $56, %ecx
 ; FALLBACK5-NEXT:    andl $56, %eax
 ; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
@@ -12540,9 +12536,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    leal (,%rax,8), %esi
-; FALLBACK6-NEXT:    andl $56, %esi
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %rbx
 ; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
 ; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
 ; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
@@ -12551,34 +12546,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
 ; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
 ; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT:    movl %esi, %ebx
-; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movl %esi, %r11d
+; FALLBACK6-NEXT:    andb $56, %r11b
+; FALLBACK6-NEXT:    notb %r11b
 ; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
 ; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT:    orq %r11, %r8
-; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK6-NEXT:    orq %rbx, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK6-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r12, %rbx
 ; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
 ; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
 ; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
 ; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK6-NEXT:    shrxq %rsi, %rax, %rsi
 ; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK6-NEXT:    orq %r9, %rdi
 ; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK6-NEXT:    orq %r14, %r9
 ; FALLBACK6-NEXT:    addq %r10, %r10
-; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK6-NEXT:    orq %r15, %r10
 ; FALLBACK6-NEXT:    addq %rax, %rax
-; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK6-NEXT:    orq %r13, %rax
 ; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %r11, %rcx, %rcx
 ; FALLBACK6-NEXT:    orq %rbp, %rcx
 ; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
@@ -12586,7 +12582,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %r8, (%rdx)
 ; FALLBACK6-NEXT:    addq $8, %rsp
 ; FALLBACK6-NEXT:    popq %rbx
@@ -12617,7 +12613,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK7-NEXT:    andl $56, %ecx
 ; FALLBACK7-NEXT:    andl $56, %eax
 ; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
@@ -12637,17 +12632,16 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
 ; FALLBACK7-NEXT:    movq %rax, %r15
 ; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK7-NEXT:    shrxq %rcx, %r11, %r10
-; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT:    shrxq %rcx, %r11, %rax
 ; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
 ; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
 ; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
 ; FALLBACK7-NEXT:    movq %r14, (%rdx)
-; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK7-NEXT:    popq %rbx
 ; FALLBACK7-NEXT:    popq %r14
 ; FALLBACK7-NEXT:    popq %r15
@@ -12671,13 +12665,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    leal (,%r9,8), %eax
-; FALLBACK8-NEXT:    andl $56, %eax
 ; FALLBACK8-NEXT:    andl $56, %r9d
 ; FALLBACK8-NEXT:    movq -128(%rsp,%r9), %r10
 ; FALLBACK8-NEXT:    movq -120(%rsp,%r9), %r8
 ; FALLBACK8-NEXT:    movl %eax, %ecx
 ; FALLBACK8-NEXT:    shrq %cl, %r10
 ; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    andb $56, %sil
 ; FALLBACK8-NEXT:    notb %sil
 ; FALLBACK8-NEXT:    leaq (%r8,%r8), %rdi
 ; FALLBACK8-NEXT:    movl %esi, %ecx
@@ -12755,43 +12749,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK9-NEXT:    pushq %rbx
 ; FALLBACK9-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK9-NEXT:    vmovups 32(%rdi), %ymm1
-; FALLBACK9-NEXT:    movl (%rsi), %eax
+; FALLBACK9-NEXT:    movl (%rsi), %edi
 ; FALLBACK9-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK9-NEXT:    andl $56, %ecx
-; FALLBACK9-NEXT:    andl $56, %eax
-; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
-; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
-; FALLBACK9-NEXT:    movq %r9, %rsi
-; FALLBACK9-NEXT:    shrdq %cl, %rdi, %rsi
-; FALLBACK9-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK9-NEXT:    leal (,%rdi,8), %ecx
+; FALLBACK9-NEXT:    andl $56, %edi
+; FALLBACK9-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK9-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK9-NEXT:    movq %r9, %rax
+; FALLBACK9-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK9-NEXT:    movq -112(%rsp,%rdi), %r10
 ; FALLBACK9-NEXT:    movq %r10, %r8
 ; FALLBACK9-NEXT:    shrdq %cl, %r9, %r8
-; FALLBACK9-NEXT:    movq -80(%rsp,%rax), %r9
-; FALLBACK9-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK9-NEXT:    movq -88(%rsp,%rdi), %r11
 ; FALLBACK9-NEXT:    movq %r11, %rbx
 ; FALLBACK9-NEXT:    shrdq %cl, %r9, %rbx
-; FALLBACK9-NEXT:    shrdq %cl, %r11, %rdi
-; FALLBACK9-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK9-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK9-NEXT:    movq -72(%rsp,%rdi), %r11
 ; FALLBACK9-NEXT:    shrdq %cl, %r11, %r9
-; FALLBACK9-NEXT:    movq -128(%rsp,%rax), %r14
-; FALLBACK9-NEXT:    movq -120(%rsp,%rax), %rax
-; FALLBACK9-NEXT:    movq %rax, %r15
+; FALLBACK9-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK9-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK9-NEXT:    movq %rdi, %r15
 ; FALLBACK9-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK9-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK9-NEXT:    shrdq %cl, %rdi, %r14
 ; FALLBACK9-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK9-NEXT:    shrq %cl, %r11
 ; FALLBACK9-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK9-NEXT:    movq %r9, 48(%rdx)
 ; FALLBACK9-NEXT:    movq %r11, 56(%rdx)
-; FALLBACK9-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK9-NEXT:    movq %rsi, 32(%rdx)
 ; FALLBACK9-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK9-NEXT:    movq %r8, 16(%rdx)
-; FALLBACK9-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK9-NEXT:    movq %rax, 24(%rdx)
 ; FALLBACK9-NEXT:    movq %r14, (%rdx)
 ; FALLBACK9-NEXT:    popq %rbx
 ; FALLBACK9-NEXT:    popq %r14
@@ -12817,9 +12810,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    leal (,%rax,8), %esi
-; FALLBACK10-NEXT:    andl $56, %esi
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %rbx
 ; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
 ; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
 ; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
@@ -12828,34 +12820,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
 ; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
 ; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT:    movl %esi, %ebx
-; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movl %esi, %r11d
+; FALLBACK10-NEXT:    andb $56, %r11b
+; FALLBACK10-NEXT:    notb %r11b
 ; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
 ; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT:    orq %r11, %r8
-; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK10-NEXT:    orq %rbx, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK10-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r12, %rbx
 ; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
 ; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
 ; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
 ; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK10-NEXT:    shrxq %rsi, %rax, %rsi
 ; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK10-NEXT:    orq %r9, %rdi
 ; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK10-NEXT:    orq %r14, %r9
 ; FALLBACK10-NEXT:    addq %r10, %r10
-; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK10-NEXT:    orq %r15, %r10
 ; FALLBACK10-NEXT:    addq %rax, %rax
-; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK10-NEXT:    orq %r13, %rax
 ; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %r11, %rcx, %rcx
 ; FALLBACK10-NEXT:    orq %rbp, %rcx
 ; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
@@ -12863,7 +12856,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK10-NEXT:    movq %r8, (%rdx)
 ; FALLBACK10-NEXT:    addq $8, %rsp
 ; FALLBACK10-NEXT:    popq %rbx
@@ -12882,44 +12875,42 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK11-NEXT:    pushq %rbx
 ; FALLBACK11-NEXT:    vmovups (%rdi), %ymm0
 ; FALLBACK11-NEXT:    vmovups 32(%rdi), %ymm1
-; FALLBACK11-NEXT:    movl (%rsi), %eax
+; FALLBACK11-NEXT:    movl (%rsi), %edi
 ; FALLBACK11-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    vmovups %ymm2, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
-; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK11-NEXT:    andl $56, %ecx
-; FALLBACK11-NEXT:    andl $56, %eax
-; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
-; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
-; FALLBACK11-NEXT:    movq %r9, %rsi
-; FALLBACK11-NEXT:    shrdq %cl, %rdi, %rsi
-; FALLBACK11-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK11-NEXT:    leal (,%rdi,8), %ecx
+; FALLBACK11-NEXT:    andl $56, %edi
+; FALLBACK11-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK11-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK11-NEXT:    movq %r9, %rax
+; FALLBACK11-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK11-NEXT:    movq -112(%rsp,%rdi), %r10
 ; FALLBACK11-NEXT:    movq %r10, %r8
 ; FALLBACK11-NEXT:    shrdq %cl, %r9, %r8
-; FALLBACK11-NEXT:    movq -80(%rsp,%rax), %r9
-; FALLBACK11-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK11-NEXT:    movq -88(%rsp,%rdi), %r11
 ; FALLBACK11-NEXT:    movq %r11, %rbx
 ; FALLBACK11-NEXT:    shrdq %cl, %r9, %rbx
-; FALLBACK11-NEXT:    shrdq %cl, %r11, %rdi
-; FALLBACK11-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK11-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK11-NEXT:    movq -72(%rsp,%rdi), %r11
 ; FALLBACK11-NEXT:    shrdq %cl, %r11, %r9
-; FALLBACK11-NEXT:    movq -128(%rsp,%rax), %r14
-; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
-; FALLBACK11-NEXT:    movq %rax, %r15
+; FALLBACK11-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK11-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK11-NEXT:    movq %rdi, %r15
 ; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK11-NEXT:    shrxq %rcx, %r11, %r10
-; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
-; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT:    shrdq %cl, %rdi, %r14
+; FALLBACK11-NEXT:    shrxq %rcx, %r11, %rcx
 ; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
-; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK11-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK11-NEXT:    movq %rsi, 32(%rdx)
 ; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
-; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 24(%rdx)
 ; FALLBACK11-NEXT:    movq %r14, (%rdx)
-; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK11-NEXT:    popq %rbx
 ; FALLBACK11-NEXT:    popq %r14
 ; FALLBACK11-NEXT:    popq %r15
@@ -12941,13 +12932,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    leal (,%r9,8), %eax
-; FALLBACK12-NEXT:    andl $56, %eax
 ; FALLBACK12-NEXT:    andl $56, %r9d
 ; FALLBACK12-NEXT:    movq -128(%rsp,%r9), %r10
 ; FALLBACK12-NEXT:    movq -120(%rsp,%r9), %r8
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %r10
 ; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    andb $56, %sil
 ; FALLBACK12-NEXT:    notb %sil
 ; FALLBACK12-NEXT:    leaq (%r8,%r8), %rdi
 ; FALLBACK12-NEXT:    movl %esi, %ecx
@@ -13029,7 +13020,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    leal (,%rdi,8), %ecx
-; FALLBACK13-NEXT:    andl $56, %ecx
 ; FALLBACK13-NEXT:    andl $56, %edi
 ; FALLBACK13-NEXT:    movq -96(%rsp,%rdi), %rsi
 ; FALLBACK13-NEXT:    movq -104(%rsp,%rdi), %r9
@@ -13081,9 +13071,8 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    leal (,%rsi,8), %ecx
-; FALLBACK14-NEXT:    andl $56, %ecx
 ; FALLBACK14-NEXT:    andl $56, %esi
-; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %r11
+; FALLBACK14-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rbx
 ; FALLBACK14-NEXT:    movq -112(%rsp,%rsi), %rax
 ; FALLBACK14-NEXT:    movq -104(%rsp,%rsi), %rdi
 ; FALLBACK14-NEXT:    shrxq %rcx, %rdi, %r12
@@ -13092,34 +13081,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq -88(%rsp,%rsi), %r10
 ; FALLBACK14-NEXT:    shrxq %rcx, %r10, %r14
 ; FALLBACK14-NEXT:    shrxq %rcx, %r13, %r15
-; FALLBACK14-NEXT:    movl %ecx, %ebx
-; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movl %ecx, %r11d
+; FALLBACK14-NEXT:    andb $56, %r11b
+; FALLBACK14-NEXT:    notb %r11b
 ; FALLBACK14-NEXT:    movq -120(%rsp,%rsi), %rbp
 ; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT:    orq %r11, %r8
-; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK14-NEXT:    orq %rbx, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK14-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r12, %rbx
 ; FALLBACK14-NEXT:    movq -80(%rsp,%rsi), %r12
 ; FALLBACK14-NEXT:    shrxq %rcx, %r12, %r13
 ; FALLBACK14-NEXT:    shrxq %rcx, %rbp, %rbp
 ; FALLBACK14-NEXT:    movq -72(%rsp,%rsi), %rsi
 ; FALLBACK14-NEXT:    shrxq %rcx, %rsi, %rcx
 ; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK14-NEXT:    orq %r9, %rdi
 ; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK14-NEXT:    orq %r14, %r9
 ; FALLBACK14-NEXT:    addq %r10, %r10
-; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK14-NEXT:    orq %r15, %r10
 ; FALLBACK14-NEXT:    addq %rsi, %rsi
-; FALLBACK14-NEXT:    shlxq %rbx, %rsi, %rsi
+; FALLBACK14-NEXT:    shlxq %r11, %rsi, %rsi
 ; FALLBACK14-NEXT:    orq %r13, %rsi
 ; FALLBACK14-NEXT:    addq %rax, %rax
-; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK14-NEXT:    orq %rbp, %rax
 ; FALLBACK14-NEXT:    movq %rcx, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 8(%rdx)
@@ -13127,7 +13117,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK14-NEXT:    movq %r8, (%rdx)
 ; FALLBACK14-NEXT:    addq $8, %rsp
 ; FALLBACK14-NEXT:    popq %rbx
@@ -13145,42 +13135,40 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK15-NEXT:    pushq %r14
 ; FALLBACK15-NEXT:    pushq %rbx
 ; FALLBACK15-NEXT:    vmovups (%rdi), %zmm0
-; FALLBACK15-NEXT:    movl (%rsi), %eax
+; FALLBACK15-NEXT:    movl (%rsi), %edi
 ; FALLBACK15-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
-; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK15-NEXT:    andl $56, %ecx
-; FALLBACK15-NEXT:    andl $56, %eax
-; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
-; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
-; FALLBACK15-NEXT:    movq %r9, %rsi
-; FALLBACK15-NEXT:    shrdq %cl, %rdi, %rsi
-; FALLBACK15-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK15-NEXT:    leal (,%rdi,8), %ecx
+; FALLBACK15-NEXT:    andl $56, %edi
+; FALLBACK15-NEXT:    movq -96(%rsp,%rdi), %rsi
+; FALLBACK15-NEXT:    movq -104(%rsp,%rdi), %r9
+; FALLBACK15-NEXT:    movq %r9, %rax
+; FALLBACK15-NEXT:    shrdq %cl, %rsi, %rax
+; FALLBACK15-NEXT:    movq -112(%rsp,%rdi), %r10
 ; FALLBACK15-NEXT:    movq %r10, %r8
 ; FALLBACK15-NEXT:    shrdq %cl, %r9, %r8
-; FALLBACK15-NEXT:    movq -80(%rsp,%rax), %r9
-; FALLBACK15-NEXT:    movq -88(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    movq -80(%rsp,%rdi), %r9
+; FALLBACK15-NEXT:    movq -88(%rsp,%rdi), %r11
 ; FALLBACK15-NEXT:    movq %r11, %rbx
 ; FALLBACK15-NEXT:    shrdq %cl, %r9, %rbx
-; FALLBACK15-NEXT:    shrdq %cl, %r11, %rdi
-; FALLBACK15-NEXT:    movq -72(%rsp,%rax), %r11
+; FALLBACK15-NEXT:    shrdq %cl, %r11, %rsi
+; FALLBACK15-NEXT:    movq -72(%rsp,%rdi), %r11
 ; FALLBACK15-NEXT:    shrdq %cl, %r11, %r9
-; FALLBACK15-NEXT:    movq -128(%rsp,%rax), %r14
-; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
-; FALLBACK15-NEXT:    movq %rax, %r15
+; FALLBACK15-NEXT:    movq -128(%rsp,%rdi), %r14
+; FALLBACK15-NEXT:    movq -120(%rsp,%rdi), %rdi
+; FALLBACK15-NEXT:    movq %rdi, %r15
 ; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK15-NEXT:    shrxq %rcx, %r11, %r10
-; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
-; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    shrdq %cl, %rdi, %r14
+; FALLBACK15-NEXT:    shrxq %rcx, %r11, %rcx
 ; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
-; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
+; FALLBACK15-NEXT:    movq %rcx, 56(%rdx)
+; FALLBACK15-NEXT:    movq %rsi, 32(%rdx)
 ; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
-; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 24(%rdx)
 ; FALLBACK15-NEXT:    movq %r14, (%rdx)
-; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK15-NEXT:    popq %rbx
 ; FALLBACK15-NEXT:    popq %r14
 ; FALLBACK15-NEXT:    popq %r15
@@ -13215,198 +13203,202 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl 36(%eax), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 40(%eax), %ebp
-; FALLBACK16-NEXT:    movl 44(%eax), %ebx
-; FALLBACK16-NEXT:    movl 48(%eax), %edi
-; FALLBACK16-NEXT:    movl 52(%eax), %esi
-; FALLBACK16-NEXT:    movl 56(%eax), %edx
-; FALLBACK16-NEXT:    movl 60(%eax), %ecx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    movl 40(%eax), %ebx
+; FALLBACK16-NEXT:    movl 44(%eax), %edi
+; FALLBACK16-NEXT:    movl 48(%eax), %esi
+; FALLBACK16-NEXT:    movl 52(%eax), %edx
+; FALLBACK16-NEXT:    movl 56(%eax), %ecx
+; FALLBACK16-NEXT:    movl 60(%eax), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %ebp
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %eax, %esi
-; FALLBACK16-NEXT:    andl $60, %esi
-; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT:    shll $3, %eax
-; FALLBACK16-NEXT:    andl $24, %eax
-; FALLBACK16-NEXT:    movl %edx, %edi
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl %ebp, %ecx
+; FALLBACK16-NEXT:    andl $60, %ecx
+; FALLBACK16-NEXT:    movl 68(%esp,%ecx), %edi
+; FALLBACK16-NEXT:    movl %ecx, %ebx
+; FALLBACK16-NEXT:    shll $3, %edx
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK16-NEXT:    movb %al, %ch
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT:    movb %dl, %ch
+; FALLBACK16-NEXT:    andb $24, %ch
 ; FALLBACK16-NEXT:    notb %ch
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %edi, %edx
+; FALLBACK16-NEXT:    shll %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK16-NEXT:    movl %edx, %ebp
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %ebp, %ebx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movl %eax, %edx
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    addl %eax, %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 92(%esp,%eax), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
 ; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
 ; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %eax, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK16-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %esi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 104(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %eax
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %ebp, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
 ; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movl %edi, %ebp
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 112(%esp,%ebx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %ebp, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %esi
 ; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    orl %edx, %edi
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK16-NEXT:    movl %esi, %ebx
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebx
-; FALLBACK16-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK16-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    orl %esi, %edi
+; FALLBACK16-NEXT:    movl 116(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    movl %esi, %eax
 ; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl 120(%esp,%ebx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %edx
 ; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    orl %ebx, %esi
-; FALLBACK16-NEXT:    movb %dl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%ebx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %eax, %edx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movl %eax, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %ebx
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK16-NEXT:    movl %esi, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK16-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
@@ -13504,91 +13496,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl %ecx, %ebp
-; FALLBACK17-NEXT:    andl $60, %ebp
-; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shll $3, %ecx
-; FALLBACK17-NEXT:    andl $24, %ecx
-; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %esi
-; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %ecx, %eax
+; FALLBACK17-NEXT:    andl $60, %eax
+; FALLBACK17-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shll $3, %ecx
 ; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl %esi, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edi
-; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edi
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %edi, %esi
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK17-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %esi, 56(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT:    shrl %cl, %eax
-; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK17-NEXT:    movl %ebx, (%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    shrl %cl, %edx
+; FALLBACK17-NEXT:    movl %edx, 60(%eax)
+; FALLBACK17-NEXT:    movl %edi, 48(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK17-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK17-NEXT:    movl %ebp, (%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK17-NEXT:    addl $188, %esp
 ; FALLBACK17-NEXT:    popl %esi
 ; FALLBACK17-NEXT:    popl %edi
@@ -13665,13 +13656,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
-; FALLBACK18-NEXT:    andl $24, %edx
 ; FALLBACK18-NEXT:    andl $60, %ecx
 ; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    andb $24, %bl
 ; FALLBACK18-NEXT:    notb %bl
 ; FALLBACK18-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
@@ -13872,7 +13863,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK19-NEXT:    shll $3, %ecx
-; FALLBACK19-NEXT:    andl $24, %ecx
 ; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
@@ -13969,7 +13959,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    movl (%eax), %ecx
 ; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -13979,106 +13969,110 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %eax, %esi
+; FALLBACK20-NEXT:    movl %ecx, %esi
 ; FALLBACK20-NEXT:    andl $60, %esi
-; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %edi
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl 68(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    shll $3, %ecx
+; FALLBACK20-NEXT:    movl %ebx, %edi
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl 72(%esp,%esi), %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    movl %ecx, %edx
+; FALLBACK20-NEXT:    movl %ecx, %eax
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movb %al, %ch
-; FALLBACK20-NEXT:    notb %ch
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    andb $24, %dl
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %edx, %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %edi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %edx, %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    addl %eax, %eax
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
 ; FALLBACK20-NEXT:    orl %edi, %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl 76(%esp,%esi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl 80(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
+; FALLBACK20-NEXT:    orl %ebp, %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK20-NEXT:    addl %ebx, %ebx
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
 ; FALLBACK20-NEXT:    orl %edi, %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 88(%esp,%esi), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 96(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    movl 104(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    addl %edi, %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
 ; FALLBACK20-NEXT:    movl %edi, %ebp
 ; FALLBACK20-NEXT:    movl %eax, %ecx
@@ -14086,52 +14080,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
 ; FALLBACK20-NEXT:    orl %ebp, %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %ebx
 ; FALLBACK20-NEXT:    addl %edi, %edi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movl %esi, %edx
+; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    movl %esi, %ebp
 ; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
 ; FALLBACK20-NEXT:    movl %esi, %ebx
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 120(%esp,%ebp), %ebp
+; FALLBACK20-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %eax
+; FALLBACK20-NEXT:    orl %ebx, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
 ; FALLBACK20-NEXT:    addl %esi, %esi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %esi
 ; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl %ebp, %eax
 ; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %eax, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK20-NEXT:    shrl %cl, %ebx
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK20-NEXT:    movl %esi, 48(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK20-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
@@ -14185,91 +14182,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK21-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK21-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK21-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT:    movl %ecx, %ebp
-; FALLBACK21-NEXT:    andl $60, %ebp
-; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shll $3, %ecx
-; FALLBACK21-NEXT:    andl $24, %ecx
-; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %esi
-; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %ecx, %eax
+; FALLBACK21-NEXT:    andl $60, %eax
+; FALLBACK21-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shll $3, %ecx
 ; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK21-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK21-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl %esi, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edi
-; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edi
 ; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %edi, %esi
 ; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK21-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl %esi, 56(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT:    shrl %cl, %eax
-; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK21-NEXT:    movl %ebx, (%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    shrl %cl, %edx
+; FALLBACK21-NEXT:    movl %edx, 60(%eax)
+; FALLBACK21-NEXT:    movl %edi, 48(%eax)
+; FALLBACK21-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK21-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK21-NEXT:    movl %ebp, (%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK21-NEXT:    addl $188, %esp
 ; FALLBACK21-NEXT:    popl %esi
 ; FALLBACK21-NEXT:    popl %edi
@@ -14290,7 +14286,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK22-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK22-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK22-NEXT:    movl (%eax), %ecx
+; FALLBACK22-NEXT:    movl (%eax), %edx
 ; FALLBACK22-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -14300,112 +14296,115 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK22-NEXT:    leal (,%ecx,8), %edx
-; FALLBACK22-NEXT:    andl $24, %edx
-; FALLBACK22-NEXT:    andl $60, %ecx
-; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
-; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    leal (,%edx,8), %ecx
+; FALLBACK22-NEXT:    andl $60, %edx
+; FALLBACK22-NEXT:    movl 68(%esp,%edx), %esi
+; FALLBACK22-NEXT:    movl 72(%esp,%edx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %edi
-; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %edi
+; FALLBACK22-NEXT:    movl %ecx, %ebx
+; FALLBACK22-NEXT:    andb $24, %bl
 ; FALLBACK22-NEXT:    notb %bl
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebp
-; FALLBACK22-NEXT:    orl %edi, %ebp
-; FALLBACK22-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
+; FALLBACK22-NEXT:    orl %edi, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl %ecx, %eax
+; FALLBACK22-NEXT:    shrxl %ecx, 64(%esp,%edx), %edi
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %edi, %esi
-; FALLBACK22-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 80(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
+; FALLBACK22-NEXT:    orl %edi, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 80(%esp,%edx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 76(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
-; FALLBACK22-NEXT:    orl %ebp, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %ecx
+; FALLBACK22-NEXT:    movl 76(%esp,%edx), %edi
+; FALLBACK22-NEXT:    shrxl %eax, %edi, %ebp
+; FALLBACK22-NEXT:    orl %ebp, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
-; FALLBACK22-NEXT:    orl %eax, %edi
+; FALLBACK22-NEXT:    orl %ebp, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 88(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl 88(%esp,%edx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 84(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    movl 84(%esp,%edx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
 ; FALLBACK22-NEXT:    orl %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 96(%esp,%ecx), %esi
+; FALLBACK22-NEXT:    movl 96(%esp,%edx), %esi
 ; FALLBACK22-NEXT:    leal (%esi,%esi), %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 92(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    movl 92(%esp,%edx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    addl %edi, %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %edi
 ; FALLBACK22-NEXT:    orl %eax, %edi
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    movl 104(%esp,%ecx), %eax
+; FALLBACK22-NEXT:    movl 104(%esp,%edx), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%eax,%eax), %edi
 ; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    movl 100(%esp,%ecx), %edi
-; FALLBACK22-NEXT:    shrxl %edx, %edi, %ebp
+; FALLBACK22-NEXT:    movl 100(%esp,%edx), %edi
+; FALLBACK22-NEXT:    shrxl %ecx, %edi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %esi
-; FALLBACK22-NEXT:    addl %edi, %edi
-; FALLBACK22-NEXT:    shlxl %ebx, %edi, %eax
-; FALLBACK22-NEXT:    orl %esi, %eax
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    shrxl %ecx, %esi, %esi
 ; FALLBACK22-NEXT:    movl %ecx, %eax
-; FALLBACK22-NEXT:    movl 112(%esp,%ecx), %ecx
+; FALLBACK22-NEXT:    addl %edi, %edi
+; FALLBACK22-NEXT:    shlxl %ebx, %edi, %ecx
+; FALLBACK22-NEXT:    orl %esi, %ecx
+; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 112(%esp,%edx), %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal (%ecx,%ecx), %esi
 ; FALLBACK22-NEXT:    shlxl %ebx, %esi, %ecx
-; FALLBACK22-NEXT:    movl 108(%esp,%eax), %esi
-; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, %esi, %ebp
+; FALLBACK22-NEXT:    movl 108(%esp,%edx), %esi
+; FALLBACK22-NEXT:    shrxl %eax, %esi, %ebp
 ; FALLBACK22-NEXT:    orl %ebp, %ecx
 ; FALLBACK22-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %eax, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
+; FALLBACK22-NEXT:    movl %eax, %ebp
 ; FALLBACK22-NEXT:    addl %esi, %esi
-; FALLBACK22-NEXT:    shlxl %ebx, %esi, %esi
-; FALLBACK22-NEXT:    orl %ecx, %esi
-; FALLBACK22-NEXT:    movl 120(%esp,%eax), %ebp
-; FALLBACK22-NEXT:    leal (%ebp,%ebp), %ecx
+; FALLBACK22-NEXT:    shlxl %ebx, %esi, %eax
+; FALLBACK22-NEXT:    orl %ecx, %eax
+; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK22-NEXT:    movl 120(%esp,%edx), %esi
+; FALLBACK22-NEXT:    leal (%esi,%esi), %ecx
 ; FALLBACK22-NEXT:    shlxl %ebx, %ecx, %ecx
-; FALLBACK22-NEXT:    movl 116(%esp,%eax), %eax
-; FALLBACK22-NEXT:    shrxl %edx, %eax, %edi
+; FALLBACK22-NEXT:    movl 116(%esp,%edx), %eax
+; FALLBACK22-NEXT:    shrxl %ebp, %eax, %edi
 ; FALLBACK22-NEXT:    orl %edi, %ecx
-; FALLBACK22-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; FALLBACK22-NEXT:    shrxl %ebp, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
 ; FALLBACK22-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    addl %eax, %eax
 ; FALLBACK22-NEXT:    shlxl %ebx, %eax, %edi
 ; FALLBACK22-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %eax
-; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK22-NEXT:    movl 124(%esp,%ebp), %ebp
-; FALLBACK22-NEXT:    shrxl %edx, %ebp, %edx
-; FALLBACK22-NEXT:    addl %ebp, %ebp
-; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %ebx
-; FALLBACK22-NEXT:    orl %eax, %ebx
+; FALLBACK22-NEXT:    shrxl %ebp, %esi, %eax
+; FALLBACK22-NEXT:    movl 124(%esp,%edx), %edx
+; FALLBACK22-NEXT:    shrxl %ebp, %edx, %ebp
+; FALLBACK22-NEXT:    addl %edx, %edx
+; FALLBACK22-NEXT:    shlxl %ebx, %edx, %edx
+; FALLBACK22-NEXT:    orl %eax, %edx
 ; FALLBACK22-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK22-NEXT:    movl %edx, 60(%eax)
-; FALLBACK22-NEXT:    movl %ebx, 56(%eax)
+; FALLBACK22-NEXT:    movl %ebp, 60(%eax)
+; FALLBACK22-NEXT:    movl %edx, 56(%eax)
 ; FALLBACK22-NEXT:    movl %edi, 48(%eax)
 ; FALLBACK22-NEXT:    movl %ecx, 52(%eax)
-; FALLBACK22-NEXT:    movl %esi, 40(%eax)
+; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK22-NEXT:    movl %ecx, 40(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK22-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK22-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -14464,7 +14463,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shll $3, %ecx
-; FALLBACK23-NEXT:    andl $24, %ecx
 ; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
@@ -14567,105 +14565,108 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %ecx, %esi
 ; FALLBACK24-NEXT:    andl $60, %esi
-; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK24-NEXT:    movl 68(%esp,%esi), %ebx
 ; FALLBACK24-NEXT:    shll $3, %ecx
-; FALLBACK24-NEXT:    andl $24, %ecx
-; FALLBACK24-NEXT:    movl %edx, %edi
+; FALLBACK24-NEXT:    movl %ebx, %edi
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    movl 72(%esp,%esi), %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK24-NEXT:    movl %ecx, %ebp
-; FALLBACK24-NEXT:    movb %cl, %ch
-; FALLBACK24-NEXT:    notb %ch
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    movl %ecx, %edx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    andb $24, %dl
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movl %ebp, %eax
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %edx, %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %edi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %edx, %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    addl %eax, %eax
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
 ; FALLBACK24-NEXT:    orl %edi, %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl 76(%esp,%esi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl 80(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
+; FALLBACK24-NEXT:    orl %ebp, %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK24-NEXT:    addl %ebx, %ebx
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
 ; FALLBACK24-NEXT:    orl %edi, %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 88(%esp,%esi), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 96(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    movl 104(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    addl %edi, %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
 ; FALLBACK24-NEXT:    movl %edi, %ebp
 ; FALLBACK24-NEXT:    movl %eax, %ecx
@@ -14673,52 +14674,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
 ; FALLBACK24-NEXT:    orl %ebp, %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %ebx
 ; FALLBACK24-NEXT:    addl %edi, %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movl %esi, %edx
+; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    movl %esi, %ebp
 ; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
 ; FALLBACK24-NEXT:    movl %esi, %ebx
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl 120(%esp,%ebp), %ebp
+; FALLBACK24-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %eax
+; FALLBACK24-NEXT:    orl %ebx, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
 ; FALLBACK24-NEXT:    addl %esi, %esi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %esi
 ; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %eax, %edx
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK24-NEXT:    shrl %cl, %ebx
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK24-NEXT:    movl %esi, 48(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK24-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
@@ -14767,91 +14771,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK25-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK25-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK25-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT:    movl %ecx, %ebp
-; FALLBACK25-NEXT:    andl $60, %ebp
-; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shll $3, %ecx
-; FALLBACK25-NEXT:    andl $24, %ecx
-; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %esi
-; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %ecx, %eax
+; FALLBACK25-NEXT:    andl $60, %eax
+; FALLBACK25-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shll $3, %ecx
 ; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK25-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK25-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl %esi, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edi
-; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edi
 ; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %edi, %esi
 ; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK25-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl %esi, 56(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT:    shrl %cl, %eax
-; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK25-NEXT:    movl %ebx, (%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    shrl %cl, %edx
+; FALLBACK25-NEXT:    movl %edx, 60(%eax)
+; FALLBACK25-NEXT:    movl %edi, 48(%eax)
+; FALLBACK25-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK25-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK25-NEXT:    movl %ebp, (%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK25-NEXT:    addl $188, %esp
 ; FALLBACK25-NEXT:    popl %esi
 ; FALLBACK25-NEXT:    popl %edi
@@ -14878,13 +14881,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    leal (,%ecx,8), %edx
-; FALLBACK26-NEXT:    andl $24, %edx
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %edi
 ; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    andb $24, %bl
 ; FALLBACK26-NEXT:    notb %bl
 ; FALLBACK26-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %ebp
@@ -15035,7 +15038,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK27-NEXT:    shll $3, %ecx
-; FALLBACK27-NEXT:    andl $24, %ecx
 ; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
@@ -15136,105 +15138,108 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %ecx, %esi
 ; FALLBACK28-NEXT:    andl $60, %esi
-; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
+; FALLBACK28-NEXT:    movl 68(%esp,%esi), %ebx
 ; FALLBACK28-NEXT:    shll $3, %ecx
-; FALLBACK28-NEXT:    andl $24, %ecx
-; FALLBACK28-NEXT:    movl %edx, %edi
+; FALLBACK28-NEXT:    movl %ebx, %edi
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    movl 72(%esp,%esi), %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%eax,%eax), %ebx
-; FALLBACK28-NEXT:    movl %ecx, %ebp
-; FALLBACK28-NEXT:    movb %cl, %ch
-; FALLBACK28-NEXT:    notb %ch
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    movl %ecx, %edx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    andb $24, %dl
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movl %ebp, %eax
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %edx, %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %edi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %edx, %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    addl %eax, %eax
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
 ; FALLBACK28-NEXT:    orl %edi, %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl 76(%esp,%esi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl 80(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
+; FALLBACK28-NEXT:    orl %ebp, %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK28-NEXT:    addl %ebx, %ebx
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
 ; FALLBACK28-NEXT:    orl %edi, %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 88(%esp,%esi), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 96(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    movl 104(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    addl %edi, %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
 ; FALLBACK28-NEXT:    movl %edi, %ebp
 ; FALLBACK28-NEXT:    movl %eax, %ecx
@@ -15242,52 +15247,55 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
 ; FALLBACK28-NEXT:    orl %ebp, %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %ebx
 ; FALLBACK28-NEXT:    addl %edi, %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movl %esi, %edx
+; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    movl %esi, %ebp
 ; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
 ; FALLBACK28-NEXT:    movl %esi, %ebx
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl 120(%esp,%ebp), %ebp
+; FALLBACK28-NEXT:    leal (%ebp,%ebp), %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %eax
+; FALLBACK28-NEXT:    orl %ebx, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
 ; FALLBACK28-NEXT:    addl %esi, %esi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %esi
 ; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %eax, %edx
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK28-NEXT:    shrl %cl, %ebx
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK28-NEXT:    movl %esi, 48(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK28-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
@@ -15333,91 +15341,90 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK29-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK29-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK29-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT:    movl %ecx, %ebp
-; FALLBACK29-NEXT:    andl $60, %ebp
-; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shll $3, %ecx
-; FALLBACK29-NEXT:    andl $24, %ecx
-; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %esi
-; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %ecx, %eax
+; FALLBACK29-NEXT:    andl $60, %eax
+; FALLBACK29-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shll $3, %ecx
 ; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK29-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK29-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl %esi, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edi
-; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edi
 ; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %edi, %esi
 ; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK29-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl %esi, 56(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT:    shrl %cl, %eax
-; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK29-NEXT:    movl %ebx, (%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    shrl %cl, %edx
+; FALLBACK29-NEXT:    movl %edx, 60(%eax)
+; FALLBACK29-NEXT:    movl %edi, 48(%eax)
+; FALLBACK29-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK29-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK29-NEXT:    movl %ebp, (%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK29-NEXT:    addl $188, %esp
 ; FALLBACK29-NEXT:    popl %esi
 ; FALLBACK29-NEXT:    popl %edi
@@ -15441,13 +15448,13 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    leal (,%edx,8), %ecx
-; FALLBACK30-NEXT:    andl $24, %ecx
 ; FALLBACK30-NEXT:    andl $60, %edx
 ; FALLBACK30-NEXT:    movl 68(%esp,%edx), %esi
 ; FALLBACK30-NEXT:    movl 72(%esp,%edx), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    shrxl %ecx, %esi, %edi
 ; FALLBACK30-NEXT:    movl %ecx, %ebx
+; FALLBACK30-NEXT:    andb $24, %bl
 ; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    leal (%eax,%eax), %ebp
 ; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %ebp
@@ -15595,7 +15602,6 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK31-NEXT:    shll $3, %ecx
-; FALLBACK31-NEXT:    andl $24, %ecx
 ; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
@@ -16056,7 +16062,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    leal (,%rsi,8), %eax
-; FALLBACK0-NEXT:    andl $56, %eax
 ; FALLBACK0-NEXT:    andl $56, %esi
 ; FALLBACK0-NEXT:    negl %esi
 ; FALLBACK0-NEXT:    movslq %esi, %rbx
@@ -16066,6 +16071,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movl %eax, %ecx
 ; FALLBACK0-NEXT:    shlq %cl, %r10
 ; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    andb $56, %sil
 ; FALLBACK0-NEXT:    notb %sil
 ; FALLBACK0-NEXT:    movq %r8, %r9
 ; FALLBACK0-NEXT:    shrq %r9
@@ -16164,7 +16170,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK1-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    leal (,%rsi,8), %ecx
-; FALLBACK1-NEXT:    andl $56, %ecx
 ; FALLBACK1-NEXT:    andl $56, %esi
 ; FALLBACK1-NEXT:    negl %esi
 ; FALLBACK1-NEXT:    movslq %esi, %r9
@@ -16232,27 +16237,27 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rsi,8), %eax
-; FALLBACK2-NEXT:    andl $56, %eax
 ; FALLBACK2-NEXT:    andl $56, %esi
 ; FALLBACK2-NEXT:    negl %esi
 ; FALLBACK2-NEXT:    movslq %esi, %rsi
 ; FALLBACK2-NEXT:    movq -64(%rsp,%rsi), %r10
 ; FALLBACK2-NEXT:    movq -56(%rsp,%rsi), %rcx
-; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r9
+; FALLBACK2-NEXT:    shlxq %rax, %rcx, %r8
 ; FALLBACK2-NEXT:    movq -40(%rsp,%rsi), %rdi
 ; FALLBACK2-NEXT:    shlxq %rax, %rdi, %r11
 ; FALLBACK2-NEXT:    movq -48(%rsp,%rsi), %r14
 ; FALLBACK2-NEXT:    shlxq %rax, %r14, %rbx
-; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r8
-; FALLBACK2-NEXT:    shlxq %rax, %r8, %r15
+; FALLBACK2-NEXT:    movq -24(%rsp,%rsi), %r9
+; FALLBACK2-NEXT:    shlxq %rax, %r9, %r15
 ; FALLBACK2-NEXT:    shlxq %rax, %r10, %r12
 ; FALLBACK2-NEXT:    movl %eax, %r13d
+; FALLBACK2-NEXT:    andb $56, %r13b
 ; FALLBACK2-NEXT:    notb %r13b
 ; FALLBACK2-NEXT:    shrq %r10
 ; FALLBACK2-NEXT:    shrxq %r13, %r10, %r10
-; FALLBACK2-NEXT:    orq %r9, %r10
-; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r9
-; FALLBACK2-NEXT:    shlxq %rax, %r9, %rbp
+; FALLBACK2-NEXT:    orq %r8, %r10
+; FALLBACK2-NEXT:    movq -32(%rsp,%rsi), %r8
+; FALLBACK2-NEXT:    shlxq %rax, %r8, %rbp
 ; FALLBACK2-NEXT:    shrq %r14
 ; FALLBACK2-NEXT:    shrxq %r13, %r14, %r14
 ; FALLBACK2-NEXT:    orq %r11, %r14
@@ -16262,23 +16267,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    shrq %rcx
 ; FALLBACK2-NEXT:    shrxq %r13, %rcx, %rcx
 ; FALLBACK2-NEXT:    orq %rbx, %rcx
-; FALLBACK2-NEXT:    shrq %r9
-; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
-; FALLBACK2-NEXT:    orq %r15, %r9
+; FALLBACK2-NEXT:    shrq %r8
+; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
+; FALLBACK2-NEXT:    orq %r15, %r8
 ; FALLBACK2-NEXT:    shrq %rdi
 ; FALLBACK2-NEXT:    shrxq %r13, %rdi, %rdi
 ; FALLBACK2-NEXT:    orq %rbp, %rdi
 ; FALLBACK2-NEXT:    shrq %rsi
 ; FALLBACK2-NEXT:    shrxq %r13, %rsi, %rsi
 ; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    shrq %r8
-; FALLBACK2-NEXT:    shrxq %r13, %r8, %r8
-; FALLBACK2-NEXT:    orq %rax, %r8
+; FALLBACK2-NEXT:    shrq %r9
+; FALLBACK2-NEXT:    shrxq %r13, %r9, %r9
+; FALLBACK2-NEXT:    orq %rax, %r9
 ; FALLBACK2-NEXT:    movq %r12, (%rdx)
-; FALLBACK2-NEXT:    movq %r8, 48(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 48(%rdx)
 ; FALLBACK2-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK2-NEXT:    movq %rdi, 32(%rdx)
-; FALLBACK2-NEXT:    movq %r9, 40(%rdx)
+; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
 ; FALLBACK2-NEXT:    movq %rcx, 16(%rdx)
 ; FALLBACK2-NEXT:    movq %r14, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
@@ -16319,7 +16324,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK3-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    leal (,%rsi,8), %ecx
-; FALLBACK3-NEXT:    andl $56, %ecx
 ; FALLBACK3-NEXT:    andl $56, %esi
 ; FALLBACK3-NEXT:    negl %esi
 ; FALLBACK3-NEXT:    movslq %esi, %r8
@@ -16376,7 +16380,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK4-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    leal (,%rcx,8), %eax
-; FALLBACK4-NEXT:    andl $56, %eax
 ; FALLBACK4-NEXT:    andl $56, %ecx
 ; FALLBACK4-NEXT:    negl %ecx
 ; FALLBACK4-NEXT:    movslq %ecx, %r9
@@ -16385,6 +16388,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK4-NEXT:    movl %eax, %ecx
 ; FALLBACK4-NEXT:    shlq %cl, %r10
 ; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    andb $56, %sil
 ; FALLBACK4-NEXT:    notb %sil
 ; FALLBACK4-NEXT:    movq -32(%rsp,%r9), %r11
 ; FALLBACK4-NEXT:    movq %r11, %r8
@@ -16476,7 +16480,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK5-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK5-NEXT:    andl $56, %ecx
 ; FALLBACK5-NEXT:    andl $56, %eax
 ; FALLBACK5-NEXT:    negl %eax
 ; FALLBACK5-NEXT:    movslq %eax, %r8
@@ -16537,14 +16540,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK6-NEXT:    andl $56, %ecx
 ; FALLBACK6-NEXT:    andl $56, %eax
 ; FALLBACK6-NEXT:    negl %eax
 ; FALLBACK6-NEXT:    movslq %eax, %rsi
 ; FALLBACK6-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK6-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK6-NEXT:    shlxq %rcx, %rax, %r15
 ; FALLBACK6-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK6-NEXT:    shlxq %rcx, %rdi, %r12
 ; FALLBACK6-NEXT:    movq -24(%rsp,%rsi), %r13
 ; FALLBACK6-NEXT:    shlxq %rcx, %r13, %r8
 ; FALLBACK6-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -16553,16 +16555,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq -40(%rsp,%rsi), %r14
 ; FALLBACK6-NEXT:    shlxq %rcx, %r14, %rbx
 ; FALLBACK6-NEXT:    movl %ecx, %r9d
+; FALLBACK6-NEXT:    andb $56, %r9b
 ; FALLBACK6-NEXT:    notb %r9b
 ; FALLBACK6-NEXT:    shrq %rdi
 ; FALLBACK6-NEXT:    shrxq %r9, %rdi, %rdi
-; FALLBACK6-NEXT:    orq %r12, %rdi
+; FALLBACK6-NEXT:    orq %r15, %rdi
 ; FALLBACK6-NEXT:    movq (%rsp,%rsi), %rbp
 ; FALLBACK6-NEXT:    shlxq %rcx, %rbp, %r8
 ; FALLBACK6-NEXT:    shrq %r13
-; FALLBACK6-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK6-NEXT:    orq %r15, %r12
-; FALLBACK6-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK6-NEXT:    shrxq %r9, %r13, %r15
+; FALLBACK6-NEXT:    orq %r12, %r15
+; FALLBACK6-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r12
 ; FALLBACK6-NEXT:    movq -48(%rsp,%rsi), %rsi
 ; FALLBACK6-NEXT:    shlxq %rcx, %rsi, %rcx
 ; FALLBACK6-NEXT:    shrq %r11
@@ -16579,14 +16582,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    orq %r8, %rax
 ; FALLBACK6-NEXT:    shrq %rbp
 ; FALLBACK6-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK6-NEXT:    orq %r15, %r8
+; FALLBACK6-NEXT:    orq %r12, %r8
 ; FALLBACK6-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK6-NEXT:    movq %r8, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rax, 48(%rdx)
 ; FALLBACK6-NEXT:    movq %rsi, 8(%rdx)
 ; FALLBACK6-NEXT:    movq %r14, 16(%rdx)
 ; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK6-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK6-NEXT:    movq %r15, 32(%rdx)
 ; FALLBACK6-NEXT:    movq %rdi, 40(%rdx)
 ; FALLBACK6-NEXT:    addq $24, %rsp
 ; FALLBACK6-NEXT:    popq %rbx
@@ -16617,7 +16620,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK7-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK7-NEXT:    andl $56, %ecx
 ; FALLBACK7-NEXT:    andl $56, %eax
 ; FALLBACK7-NEXT:    negl %eax
 ; FALLBACK7-NEXT:    movslq %eax, %r8
@@ -16669,7 +16671,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK8-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    leal (,%rcx,8), %eax
-; FALLBACK8-NEXT:    andl $56, %eax
 ; FALLBACK8-NEXT:    andl $56, %ecx
 ; FALLBACK8-NEXT:    negl %ecx
 ; FALLBACK8-NEXT:    movslq %ecx, %r9
@@ -16678,6 +16679,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK8-NEXT:    movl %eax, %ecx
 ; FALLBACK8-NEXT:    shlq %cl, %r10
 ; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    andb $56, %sil
 ; FALLBACK8-NEXT:    notb %sil
 ; FALLBACK8-NEXT:    movq -32(%rsp,%r9), %r11
 ; FALLBACK8-NEXT:    movq %r11, %r8
@@ -16764,7 +16766,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK9-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK9-NEXT:    andl $56, %ecx
 ; FALLBACK9-NEXT:    andl $56, %eax
 ; FALLBACK9-NEXT:    negl %eax
 ; FALLBACK9-NEXT:    movslq %eax, %r8
@@ -16820,14 +16821,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK10-NEXT:    andl $56, %ecx
 ; FALLBACK10-NEXT:    andl $56, %eax
 ; FALLBACK10-NEXT:    negl %eax
 ; FALLBACK10-NEXT:    movslq %eax, %rsi
 ; FALLBACK10-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK10-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK10-NEXT:    shlxq %rcx, %rax, %r15
 ; FALLBACK10-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK10-NEXT:    shlxq %rcx, %rdi, %r12
 ; FALLBACK10-NEXT:    movq -24(%rsp,%rsi), %r13
 ; FALLBACK10-NEXT:    shlxq %rcx, %r13, %r8
 ; FALLBACK10-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -16836,16 +16836,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq -40(%rsp,%rsi), %r14
 ; FALLBACK10-NEXT:    shlxq %rcx, %r14, %rbx
 ; FALLBACK10-NEXT:    movl %ecx, %r9d
+; FALLBACK10-NEXT:    andb $56, %r9b
 ; FALLBACK10-NEXT:    notb %r9b
 ; FALLBACK10-NEXT:    shrq %rdi
 ; FALLBACK10-NEXT:    shrxq %r9, %rdi, %rdi
-; FALLBACK10-NEXT:    orq %r12, %rdi
+; FALLBACK10-NEXT:    orq %r15, %rdi
 ; FALLBACK10-NEXT:    movq (%rsp,%rsi), %rbp
 ; FALLBACK10-NEXT:    shlxq %rcx, %rbp, %r8
 ; FALLBACK10-NEXT:    shrq %r13
-; FALLBACK10-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK10-NEXT:    orq %r15, %r12
-; FALLBACK10-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK10-NEXT:    shrxq %r9, %r13, %r15
+; FALLBACK10-NEXT:    orq %r12, %r15
+; FALLBACK10-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r12
 ; FALLBACK10-NEXT:    movq -48(%rsp,%rsi), %rsi
 ; FALLBACK10-NEXT:    shlxq %rcx, %rsi, %rcx
 ; FALLBACK10-NEXT:    shrq %r11
@@ -16862,14 +16863,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    orq %r8, %rax
 ; FALLBACK10-NEXT:    shrq %rbp
 ; FALLBACK10-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK10-NEXT:    orq %r15, %r8
+; FALLBACK10-NEXT:    orq %r12, %r8
 ; FALLBACK10-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK10-NEXT:    movq %r8, 56(%rdx)
 ; FALLBACK10-NEXT:    movq %rax, 48(%rdx)
 ; FALLBACK10-NEXT:    movq %rsi, 8(%rdx)
 ; FALLBACK10-NEXT:    movq %r14, 16(%rdx)
 ; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK10-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK10-NEXT:    movq %r15, 32(%rdx)
 ; FALLBACK10-NEXT:    movq %rdi, 40(%rdx)
 ; FALLBACK10-NEXT:    addq $24, %rsp
 ; FALLBACK10-NEXT:    popq %rbx
@@ -16895,7 +16896,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK11-NEXT:    vmovups %ymm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    vmovups %ymm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK11-NEXT:    andl $56, %ecx
 ; FALLBACK11-NEXT:    andl $56, %eax
 ; FALLBACK11-NEXT:    negl %eax
 ; FALLBACK11-NEXT:    movslq %eax, %r8
@@ -16945,7 +16945,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    leal (,%rcx,8), %eax
-; FALLBACK12-NEXT:    andl $56, %eax
 ; FALLBACK12-NEXT:    andl $56, %ecx
 ; FALLBACK12-NEXT:    negl %ecx
 ; FALLBACK12-NEXT:    movslq %ecx, %r9
@@ -16954,6 +16953,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shlq %cl, %r10
 ; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    andb $56, %sil
 ; FALLBACK12-NEXT:    notb %sil
 ; FALLBACK12-NEXT:    movq -32(%rsp,%r9), %r11
 ; FALLBACK12-NEXT:    movq %r11, %r8
@@ -17037,7 +17037,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK13-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK13-NEXT:    andl $56, %ecx
 ; FALLBACK13-NEXT:    andl $56, %eax
 ; FALLBACK13-NEXT:    negl %eax
 ; FALLBACK13-NEXT:    movslq %eax, %r8
@@ -17090,14 +17089,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK14-NEXT:    andl $56, %ecx
 ; FALLBACK14-NEXT:    andl $56, %eax
 ; FALLBACK14-NEXT:    negl %eax
 ; FALLBACK14-NEXT:    movslq %eax, %rsi
 ; FALLBACK14-NEXT:    movq -8(%rsp,%rsi), %rax
-; FALLBACK14-NEXT:    shlxq %rcx, %rax, %r12
+; FALLBACK14-NEXT:    shlxq %rcx, %rax, %r15
 ; FALLBACK14-NEXT:    movq -16(%rsp,%rsi), %rdi
-; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r15
+; FALLBACK14-NEXT:    shlxq %rcx, %rdi, %r12
 ; FALLBACK14-NEXT:    movq -24(%rsp,%rsi), %r13
 ; FALLBACK14-NEXT:    shlxq %rcx, %r13, %r8
 ; FALLBACK14-NEXT:    movq %r8, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill
@@ -17106,16 +17104,17 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq -40(%rsp,%rsi), %r14
 ; FALLBACK14-NEXT:    shlxq %rcx, %r14, %rbx
 ; FALLBACK14-NEXT:    movl %ecx, %r9d
+; FALLBACK14-NEXT:    andb $56, %r9b
 ; FALLBACK14-NEXT:    notb %r9b
 ; FALLBACK14-NEXT:    shrq %rdi
 ; FALLBACK14-NEXT:    shrxq %r9, %rdi, %rdi
-; FALLBACK14-NEXT:    orq %r12, %rdi
+; FALLBACK14-NEXT:    orq %r15, %rdi
 ; FALLBACK14-NEXT:    movq (%rsp,%rsi), %rbp
 ; FALLBACK14-NEXT:    shlxq %rcx, %rbp, %r8
 ; FALLBACK14-NEXT:    shrq %r13
-; FALLBACK14-NEXT:    shrxq %r9, %r13, %r12
-; FALLBACK14-NEXT:    orq %r15, %r12
-; FALLBACK14-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r15
+; FALLBACK14-NEXT:    shrxq %r9, %r13, %r15
+; FALLBACK14-NEXT:    orq %r12, %r15
+; FALLBACK14-NEXT:    shlxq %rcx, 8(%rsp,%rsi), %r12
 ; FALLBACK14-NEXT:    movq -48(%rsp,%rsi), %rsi
 ; FALLBACK14-NEXT:    shlxq %rcx, %rsi, %rcx
 ; FALLBACK14-NEXT:    shrq %r11
@@ -17132,14 +17131,14 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    orq %r8, %rax
 ; FALLBACK14-NEXT:    shrq %rbp
 ; FALLBACK14-NEXT:    shrxq %r9, %rbp, %r8
-; FALLBACK14-NEXT:    orq %r15, %r8
+; FALLBACK14-NEXT:    orq %r12, %r8
 ; FALLBACK14-NEXT:    movq %rcx, (%rdx)
 ; FALLBACK14-NEXT:    movq %r8, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rax, 48(%rdx)
 ; FALLBACK14-NEXT:    movq %rsi, 8(%rdx)
 ; FALLBACK14-NEXT:    movq %r14, 16(%rdx)
 ; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
-; FALLBACK14-NEXT:    movq %r12, 32(%rdx)
+; FALLBACK14-NEXT:    movq %r15, 32(%rdx)
 ; FALLBACK14-NEXT:    movq %rdi, 40(%rdx)
 ; FALLBACK14-NEXT:    addq $24, %rsp
 ; FALLBACK14-NEXT:    popq %rbx
@@ -17162,7 +17161,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK15-NEXT:    vmovups %zmm1, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    vmovups %zmm0, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK15-NEXT:    andl $56, %ecx
 ; FALLBACK15-NEXT:    andl $56, %eax
 ; FALLBACK15-NEXT:    negl %eax
 ; FALLBACK15-NEXT:    movslq %eax, %r8
@@ -17208,7 +17206,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    subl $204, %esp
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK16-NEXT:    movl (%eax), %ecx
-; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ecx, (%esp) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl 4(%eax), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl 8(%eax), %ecx
@@ -17227,233 +17225,240 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl 36(%eax), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 40(%eax), %ebp
-; FALLBACK16-NEXT:    movl 44(%eax), %ebx
-; FALLBACK16-NEXT:    movl 48(%eax), %edi
-; FALLBACK16-NEXT:    movl 52(%eax), %esi
-; FALLBACK16-NEXT:    movl 56(%eax), %edx
-; FALLBACK16-NEXT:    movl 60(%eax), %ecx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl (%eax), %eax
+; FALLBACK16-NEXT:    movl 40(%eax), %ebx
+; FALLBACK16-NEXT:    movl 44(%eax), %edi
+; FALLBACK16-NEXT:    movl 48(%eax), %esi
+; FALLBACK16-NEXT:    movl 52(%eax), %edx
+; FALLBACK16-NEXT:    movl 56(%eax), %ecx
+; FALLBACK16-NEXT:    movl 60(%eax), %eax
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; FALLBACK16-NEXT:    movl (%ebp), %ebp
 ; FALLBACK16-NEXT:    xorps %xmm0, %xmm0
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %eax, %edx
-; FALLBACK16-NEXT:    andl $60, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK16-NEXT:    subl %edx, %ecx
-; FALLBACK16-NEXT:    movl (%ecx), %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 4(%ecx), %edx
-; FALLBACK16-NEXT:    movl %ecx, %ebp
-; FALLBACK16-NEXT:    shll $3, %eax
-; FALLBACK16-NEXT:    andl $24, %eax
-; FALLBACK16-NEXT:    movl %edx, %esi
-; FALLBACK16-NEXT:    movl %eax, %ecx
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %al, %ch
-; FALLBACK16-NEXT:    notb %ch
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 12(%ebp), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 8(%ebp), %esi
-; FALLBACK16-NEXT:    movl %ebp, %edi
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK16-NEXT:    movl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebp, %ecx
+; FALLBACK16-NEXT:    andl $60, %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal {{[0-9]+}}(%esp), %eax
+; FALLBACK16-NEXT:    subl %ecx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl (%eax), %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 4(%eax), %edi
+; FALLBACK16-NEXT:    shll $3, %ebx
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    andb $24, %dl
+; FALLBACK16-NEXT:    notb %dl
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK16-NEXT:    movl 12(%esi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 8(%esi), %esi
 ; FALLBACK16-NEXT:    movl %esi, %ebp
 ; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %ebx, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %edi, %ebp
-; FALLBACK16-NEXT:    movl 20(%edi), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 16(%edi), %esi
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movl %ebx, %ebp
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edi
 ; FALLBACK16-NEXT:    orl %esi, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movl 28(%ebp), %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    movl 20(%edi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 16(%edi), %esi
+; FALLBACK16-NEXT:    movl %esi, %ebx
+; FALLBACK16-NEXT:    shrl %ebx
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %ebx
+; FALLBACK16-NEXT:    orl %eax, %ebx
 ; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 24(%ebp), %esi
-; FALLBACK16-NEXT:    movl %esi, %edi
-; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebp, %ebx
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %esi, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 36(%edx), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl 32(%edx), %esi
-; FALLBACK16-NEXT:    movl %edx, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 28(%edi), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 24(%edi), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %esi
+; FALLBACK16-NEXT:    shrl %esi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebp, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; FALLBACK16-NEXT:    movl 36(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    movl 32(%ebp), %esi
 ; FALLBACK16-NEXT:    movl %esi, %edi
 ; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 44(%ebp), %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 44(%ebp), %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    shll %cl, %eax
 ; FALLBACK16-NEXT:    movl 40(%ebp), %esi
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %esi, %edi
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %eax, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %esi, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %esi, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl 52(%ebp), %esi
 ; FALLBACK16-NEXT:    movl %esi, %edi
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebx, %ecx
 ; FALLBACK16-NEXT:    shll %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    negl %edx
-; FALLBACK16-NEXT:    movl 176(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ebp
-; FALLBACK16-NEXT:    shrl %ebp
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %ebp
-; FALLBACK16-NEXT:    orl %edi, %ebp
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    negl %eax
+; FALLBACK16-NEXT:    movl 176(%esp,%eax), %ebx
+; FALLBACK16-NEXT:    movl %ebx, %eax
+; FALLBACK16-NEXT:    shrl %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    orl %edi, %eax
+; FALLBACK16-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %edx
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    orl %ebx, %edx
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK16-NEXT:    movl 60(%edi), %edx
-; FALLBACK16-NEXT:    movb %al, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    shrl %edi
+; FALLBACK16-NEXT:    movl %edx, %ecx
+; FALLBACK16-NEXT:    shrl %cl, %edi
+; FALLBACK16-NEXT:    orl %ebx, %edi
+; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %ebp, %edi
+; FALLBACK16-NEXT:    movl 60(%ebp), %ebp
+; FALLBACK16-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
 ; FALLBACK16-NEXT:    movl 56(%edi), %ebx
 ; FALLBACK16-NEXT:    movl %ebx, %edi
 ; FALLBACK16-NEXT:    shrl %edi
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    orl %edx, %edi
-; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    orl %ebp, %edi
+; FALLBACK16-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK16-NEXT:    shll %cl, %ebx
 ; FALLBACK16-NEXT:    shrl %esi
-; FALLBACK16-NEXT:    movb %ch, %cl
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %esi
 ; FALLBACK16-NEXT:    orl %ebx, %esi
-; FALLBACK16-NEXT:    movl %eax, %ecx
+; FALLBACK16-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; FALLBACK16-NEXT:    movl %edx, (%eax)
-; FALLBACK16-NEXT:    movl %esi, 56(%eax)
-; FALLBACK16-NEXT:    movl %edi, 60(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 40(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 32(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 36(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 24(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 28(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 16(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 20(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 8(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 12(%eax)
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    movl %ecx, 4(%eax)
+; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; FALLBACK16-NEXT:    movl %edx, (%ecx)
+; FALLBACK16-NEXT:    movl %esi, 56(%ecx)
+; FALLBACK16-NEXT:    movl %edi, 60(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %edx, 48(%ecx)
+; FALLBACK16-NEXT:    movl %eax, 52(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 40(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 44(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 32(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 36(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 24(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 28(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 16(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 20(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 8(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 12(%ecx)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl %eax, 4(%ecx)
 ; FALLBACK16-NEXT:    addl $204, %esp
 ; FALLBACK16-NEXT:    popl %esi
 ; FALLBACK16-NEXT:    popl %edi
@@ -17535,7 +17540,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    movl 8(%eax), %esi
 ; FALLBACK17-NEXT:    movl 12(%eax), %edx
 ; FALLBACK17-NEXT:    shll $3, %ecx
-; FALLBACK17-NEXT:    andl $24, %ecx
 ; FALLBACK17-NEXT:    movl %edx, %edi
 ; FALLBACK17-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -17688,7 +17692,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK18-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    leal (,%ebp,8), %edx
-; FALLBACK18-NEXT:    andl $24, %edx
 ; FALLBACK18-NEXT:    andl $60, %ebp
 ; FALLBACK18-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    leal {{[0-9]+}}(%esp), %edi
@@ -17698,6 +17701,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl 4(%edi), %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    andb $24, %bl
 ; FALLBACK18-NEXT:    notb %bl
 ; FALLBACK18-NEXT:    shrl %ecx
 ; FALLBACK18-NEXT:    shrxl %ebx, %ecx, %esi
@@ -17900,7 +17904,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK19-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK19-NEXT:    leal (,%ebp,8), %ecx
-; FALLBACK19-NEXT:    andl $24, %ecx
 ; FALLBACK19-NEXT:    andl $60, %ebp
 ; FALLBACK19-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; FALLBACK19-NEXT:    subl %ebp, %eax
@@ -18005,7 +18008,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK20-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK20-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK20-NEXT:    movl (%eax), %eax
+; FALLBACK20-NEXT:    movl (%eax), %ecx
 ; FALLBACK20-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -18015,160 +18018,173 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    andl $60, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK20-NEXT:    subl %edx, %ecx
-; FALLBACK20-NEXT:    movl (%ecx), %edi
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    andl $60, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK20-NEXT:    subl %eax, %edx
+; FALLBACK20-NEXT:    movl (%edx), %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 4(%ecx), %edx
-; FALLBACK20-NEXT:    movl %ecx, %ebp
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %esi
-; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl 4(%edx), %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edx, %ebx
+; FALLBACK20-NEXT:    shll $3, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %esi
 ; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %al, %ch
-; FALLBACK20-NEXT:    notb %ch
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %ecx, %edx
+; FALLBACK20-NEXT:    movl %ecx, %eax
+; FALLBACK20-NEXT:    andb $24, %dl
+; FALLBACK20-NEXT:    notb %dl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    orl %esi, %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK20-NEXT:    movl %ebx, %esi
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl 12(%ebx), %ebx
+; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 8(%ebp), %esi
-; FALLBACK20-NEXT:    movl %ebp, %edi
+; FALLBACK20-NEXT:    movl 8(%esi), %esi
 ; FALLBACK20-NEXT:    movl %esi, %ebp
 ; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %ebp
 ; FALLBACK20-NEXT:    orl %ebx, %ebp
 ; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl %eax, %edi
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %edi, %ebp
-; FALLBACK20-NEXT:    movl 20(%edi), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK20-NEXT:    movl 20(%esi), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 16(%edi), %esi
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl 16(%esi), %esi
+; FALLBACK20-NEXT:    movl %esi, %ebp
+; FALLBACK20-NEXT:    shrl %ebp
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %ebp
+; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %esi
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
 ; FALLBACK20-NEXT:    orl %esi, %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %ebp, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; FALLBACK20-NEXT:    movl 28(%ebp), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    movl %eax, %edi
 ; FALLBACK20-NEXT:    shll %cl, %ebx
 ; FALLBACK20-NEXT:    movl 24(%ebp), %esi
-; FALLBACK20-NEXT:    movl %esi, %edi
-; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %ebx, %edi
-; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %ebx, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edi, %ecx
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    orl %esi, %ebp
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 36(%edx), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %esi, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 36(%ebp), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 32(%edx), %esi
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movl %esi, %edi
+; FALLBACK20-NEXT:    movl 32(%ebp), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %esi
+; FALLBACK20-NEXT:    shrl %esi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    orl %ebx, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %eax, %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %ebx, %edi
+; FALLBACK20-NEXT:    orl %ebp, %edi
 ; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl 44(%eax), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl 40(%ebp), %esi
-; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl 40(%eax), %esi
+; FALLBACK20-NEXT:    movl %eax, %ebp
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %ebx, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %esi, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %edi
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %edi
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl 52(%ebp), %esi
 ; FALLBACK20-NEXT:    movl %esi, %edi
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    negl %edx
-; FALLBACK20-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    negl %eax
+; FALLBACK20-NEXT:    movl 176(%esp,%eax), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, %ebp
 ; FALLBACK20-NEXT:    shrl %ebp
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %ebp
 ; FALLBACK20-NEXT:    orl %edi, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %edx
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    orl %ebx, %eax
+; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK20-NEXT:    movl 60(%edi), %edx
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
+; FALLBACK20-NEXT:    movl 60(%edi), %eax
+; FALLBACK20-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    shll %cl, %eax
 ; FALLBACK20-NEXT:    movl 56(%edi), %ebx
 ; FALLBACK20-NEXT:    movl %ebx, %edi
 ; FALLBACK20-NEXT:    shrl %edi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    shll %cl, %ebx
 ; FALLBACK20-NEXT:    shrl %esi
-; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    movl %edx, %ecx
 ; FALLBACK20-NEXT:    shrl %cl, %esi
 ; FALLBACK20-NEXT:    orl %ebx, %esi
 ; FALLBACK20-NEXT:    movl %eax, %ecx
@@ -18240,7 +18256,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK21-NEXT:    movl 8(%eax), %esi
 ; FALLBACK21-NEXT:    movl 12(%eax), %edx
 ; FALLBACK21-NEXT:    shll $3, %ecx
-; FALLBACK21-NEXT:    andl $24, %ecx
 ; FALLBACK21-NEXT:    movl %edx, %edi
 ; FALLBACK21-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -18349,7 +18364,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    leal (,%eax,8), %edx
-; FALLBACK22-NEXT:    andl $24, %edx
 ; FALLBACK22-NEXT:    andl $60, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    leal {{[0-9]+}}(%esp), %edi
@@ -18359,6 +18373,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl 4(%edi), %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    andb $24, %bl
 ; FALLBACK22-NEXT:    notb %bl
 ; FALLBACK22-NEXT:    shrl %ecx
 ; FALLBACK22-NEXT:    shrxl %ebx, %ecx, %esi
@@ -18506,7 +18521,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movups 16(%ecx), %xmm1
 ; FALLBACK23-NEXT:    movups 32(%ecx), %xmm2
 ; FALLBACK23-NEXT:    movups 48(%ecx), %xmm3
-; FALLBACK23-NEXT:    movl (%eax), %ebp
+; FALLBACK23-NEXT:    movl (%eax), %ebx
 ; FALLBACK23-NEXT:    xorps %xmm4, %xmm4
 ; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
 ; FALLBACK23-NEXT:    movaps %xmm4, {{[0-9]+}}(%esp)
@@ -18516,47 +18531,46 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; FALLBACK23-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK23-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; FALLBACK23-NEXT:    leal (,%ebp,8), %ecx
-; FALLBACK23-NEXT:    andl $24, %ecx
-; FALLBACK23-NEXT:    andl $60, %ebp
+; FALLBACK23-NEXT:    leal (,%ebx,8), %ecx
+; FALLBACK23-NEXT:    andl $60, %ebx
 ; FALLBACK23-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; FALLBACK23-NEXT:    subl %ebp, %eax
+; FALLBACK23-NEXT:    subl %ebx, %eax
 ; FALLBACK23-NEXT:    movl 4(%eax), %esi
 ; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 8(%eax), %edi
 ; FALLBACK23-NEXT:    movl 12(%eax), %edx
-; FALLBACK23-NEXT:    movl %edx, %ebx
-; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl %edx, %ebp
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK23-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 16(%eax), %edi
 ; FALLBACK23-NEXT:    movl 20(%eax), %esi
-; FALLBACK23-NEXT:    movl %esi, %ebx
-; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl %esi, %ebp
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK23-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
 ; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 24(%eax), %edi
 ; FALLBACK23-NEXT:    movl 28(%eax), %edx
-; FALLBACK23-NEXT:    movl %edx, %ebx
-; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl %edx, %ebp
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK23-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 32(%eax), %edi
 ; FALLBACK23-NEXT:    movl 36(%eax), %esi
-; FALLBACK23-NEXT:    movl %esi, %ebx
-; FALLBACK23-NEXT:    shldl %cl, %edi, %ebx
-; FALLBACK23-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK23-NEXT:    movl %esi, %ebp
+; FALLBACK23-NEXT:    shldl %cl, %edi, %ebp
+; FALLBACK23-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
 ; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT:    movl 40(%eax), %ebx
+; FALLBACK23-NEXT:    movl 40(%eax), %ebp
 ; FALLBACK23-NEXT:    movl 44(%eax), %edx
 ; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT:    shldl %cl, %ebx, %edx
+; FALLBACK23-NEXT:    shldl %cl, %ebp, %edx
 ; FALLBACK23-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK23-NEXT:    shldl %cl, %esi, %ebx
+; FALLBACK23-NEXT:    shldl %cl, %esi, %ebp
 ; FALLBACK23-NEXT:    movl 56(%eax), %edx
 ; FALLBACK23-NEXT:    movl 60(%eax), %edi
 ; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
@@ -18564,8 +18578,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 52(%eax), %esi
 ; FALLBACK23-NEXT:    shldl %cl, %esi, %edx
-; FALLBACK23-NEXT:    negl %ebp
-; FALLBACK23-NEXT:    movl 176(%esp,%ebp), %ebp
+; FALLBACK23-NEXT:    negl %ebx
+; FALLBACK23-NEXT:    movl 176(%esp,%ebx), %ebx
 ; FALLBACK23-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK23-NEXT:    movl %edx, 56(%eax)
 ; FALLBACK23-NEXT:    movl %edi, 60(%eax)
@@ -18574,13 +18588,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK23-NEXT:    shldl %cl, %edx, %edi
-; FALLBACK23-NEXT:    shldl %cl, %ebp, %esi
+; FALLBACK23-NEXT:    shldl %cl, %ebx, %esi
 ; FALLBACK23-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK23-NEXT:    shldl %cl, %edx, %ebp
-; FALLBACK23-NEXT:    movl %ebp, 48(%eax)
+; FALLBACK23-NEXT:    shldl %cl, %edx, %ebx
+; FALLBACK23-NEXT:    movl %ebx, 48(%eax)
 ; FALLBACK23-NEXT:    movl %esi, 52(%eax)
-; FALLBACK23-NEXT:    movl %ebx, 40(%eax)
+; FALLBACK23-NEXT:    movl %ebp, 40(%eax)
 ; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK23-NEXT:    movl %ecx, 44(%eax)
 ; FALLBACK23-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -18620,166 +18634,179 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
 ; FALLBACK24-NEXT:    vmovups 32(%ecx), %ymm1
-; FALLBACK24-NEXT:    movl (%eax), %eax
+; FALLBACK24-NEXT:    movl (%eax), %ecx
 ; FALLBACK24-NEXT:    vxorps %xmm2, %xmm2, %xmm2
 ; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm2, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    andl $60, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    subl %edx, %ecx
-; FALLBACK24-NEXT:    movl (%ecx), %edi
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    andl $60, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK24-NEXT:    subl %eax, %edx
+; FALLBACK24-NEXT:    movl (%edx), %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 4(%ecx), %edx
-; FALLBACK24-NEXT:    movl %ecx, %ebp
-; FALLBACK24-NEXT:    shll $3, %eax
-; FALLBACK24-NEXT:    andl $24, %eax
-; FALLBACK24-NEXT:    movl %edx, %esi
-; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl 4(%edx), %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edx, %ebx
+; FALLBACK24-NEXT:    shll $3, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %esi
 ; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %al, %ch
-; FALLBACK24-NEXT:    notb %ch
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %ecx, %edx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    andb $24, %dl
+; FALLBACK24-NEXT:    notb %dl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    orl %esi, %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK24-NEXT:    movl %ebx, %esi
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl 12(%ebx), %ebx
+; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 8(%ebp), %esi
-; FALLBACK24-NEXT:    movl %ebp, %edi
+; FALLBACK24-NEXT:    movl 8(%esi), %esi
 ; FALLBACK24-NEXT:    movl %esi, %ebp
 ; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %ebp
 ; FALLBACK24-NEXT:    orl %ebx, %ebp
 ; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl %eax, %edi
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %edi, %ebp
-; FALLBACK24-NEXT:    movl 20(%edi), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK24-NEXT:    movl 20(%esi), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 16(%edi), %esi
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl 16(%esi), %esi
+; FALLBACK24-NEXT:    movl %esi, %ebp
+; FALLBACK24-NEXT:    shrl %ebp
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %ebp
+; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %esi
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
 ; FALLBACK24-NEXT:    orl %esi, %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %ebp, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; FALLBACK24-NEXT:    movl 28(%ebp), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    movl %eax, %edi
 ; FALLBACK24-NEXT:    shll %cl, %ebx
 ; FALLBACK24-NEXT:    movl 24(%ebp), %esi
-; FALLBACK24-NEXT:    movl %esi, %edi
-; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %ebx, %edi
-; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %ebx, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edi, %ecx
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    orl %esi, %ebp
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 36(%edx), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %esi, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 36(%ebp), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 32(%edx), %esi
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movl %esi, %edi
+; FALLBACK24-NEXT:    movl 32(%ebp), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %esi
+; FALLBACK24-NEXT:    shrl %esi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    orl %ebx, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %eax, %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %ebx, %edi
+; FALLBACK24-NEXT:    orl %ebp, %edi
 ; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl 44(%eax), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl 40(%ebp), %esi
-; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl 40(%eax), %esi
+; FALLBACK24-NEXT:    movl %eax, %ebp
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %ebx, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %esi, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %edi
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %edi
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl 52(%ebp), %esi
 ; FALLBACK24-NEXT:    movl %esi, %edi
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    negl %edx
-; FALLBACK24-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    negl %eax
+; FALLBACK24-NEXT:    movl 176(%esp,%eax), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, %ebp
 ; FALLBACK24-NEXT:    shrl %ebp
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %ebp
 ; FALLBACK24-NEXT:    orl %edi, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %edx
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    orl %ebx, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK24-NEXT:    movl 60(%edi), %edx
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
+; FALLBACK24-NEXT:    movl 60(%edi), %eax
+; FALLBACK24-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    shll %cl, %eax
 ; FALLBACK24-NEXT:    movl 56(%edi), %ebx
 ; FALLBACK24-NEXT:    movl %ebx, %edi
 ; FALLBACK24-NEXT:    shrl %edi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    shll %cl, %ebx
 ; FALLBACK24-NEXT:    shrl %esi
-; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    movl %edx, %ecx
 ; FALLBACK24-NEXT:    shrl %cl, %esi
 ; FALLBACK24-NEXT:    orl %ebx, %esi
 ; FALLBACK24-NEXT:    movl %eax, %ecx
@@ -18846,7 +18873,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK25-NEXT:    movl 8(%eax), %esi
 ; FALLBACK25-NEXT:    movl 12(%eax), %edx
 ; FALLBACK25-NEXT:    shll $3, %ecx
-; FALLBACK25-NEXT:    andl $24, %ecx
 ; FALLBACK25-NEXT:    movl %edx, %edi
 ; FALLBACK25-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -18950,7 +18976,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    leal (,%eax,8), %edx
-; FALLBACK26-NEXT:    andl $24, %edx
 ; FALLBACK26-NEXT:    andl $60, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    leal {{[0-9]+}}(%esp), %edi
@@ -18960,6 +18985,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl 4(%edi), %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    andb $24, %bl
 ; FALLBACK26-NEXT:    notb %bl
 ; FALLBACK26-NEXT:    shrl %ecx
 ; FALLBACK26-NEXT:    shrxl %ebx, %ecx, %esi
@@ -19113,7 +19139,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK27-NEXT:    vmovups %ymm1, {{[0-9]+}}(%esp)
 ; FALLBACK27-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
 ; FALLBACK27-NEXT:    leal (,%ebx,8), %ecx
-; FALLBACK27-NEXT:    andl $24, %ecx
 ; FALLBACK27-NEXT:    andl $60, %ebx
 ; FALLBACK27-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; FALLBACK27-NEXT:    subl %ebx, %eax
@@ -19216,164 +19241,177 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; FALLBACK28-NEXT:    vmovups (%ecx), %zmm0
-; FALLBACK28-NEXT:    movl (%eax), %eax
+; FALLBACK28-NEXT:    movl (%eax), %ecx
 ; FALLBACK28-NEXT:    vxorps %xmm1, %xmm1, %xmm1
 ; FALLBACK28-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    andl $60, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT:    subl %edx, %ecx
-; FALLBACK28-NEXT:    movl (%ecx), %edi
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    andl $60, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    leal {{[0-9]+}}(%esp), %edx
+; FALLBACK28-NEXT:    subl %eax, %edx
+; FALLBACK28-NEXT:    movl (%edx), %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 4(%ecx), %edx
-; FALLBACK28-NEXT:    movl %ecx, %ebp
-; FALLBACK28-NEXT:    shll $3, %eax
-; FALLBACK28-NEXT:    andl $24, %eax
-; FALLBACK28-NEXT:    movl %edx, %esi
-; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl 4(%edx), %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edx, %ebx
+; FALLBACK28-NEXT:    shll $3, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %esi
 ; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %al, %ch
-; FALLBACK28-NEXT:    notb %ch
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %ecx, %edx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    andb $24, %dl
+; FALLBACK28-NEXT:    notb %dl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    orl %esi, %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 12(%ebp), %ebx
+; FALLBACK28-NEXT:    movl %ebx, %esi
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl 12(%ebx), %ebx
+; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 8(%ebp), %esi
-; FALLBACK28-NEXT:    movl %ebp, %edi
+; FALLBACK28-NEXT:    movl 8(%esi), %esi
 ; FALLBACK28-NEXT:    movl %esi, %ebp
 ; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %ebp
 ; FALLBACK28-NEXT:    orl %ebx, %ebp
 ; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl %eax, %edi
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %edi, %ebp
-; FALLBACK28-NEXT:    movl 20(%edi), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK28-NEXT:    movl 20(%esi), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 16(%edi), %esi
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl 16(%esi), %esi
+; FALLBACK28-NEXT:    movl %esi, %ebp
+; FALLBACK28-NEXT:    shrl %ebp
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %ebp
+; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %esi
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
 ; FALLBACK28-NEXT:    orl %esi, %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %ebp, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
 ; FALLBACK28-NEXT:    movl 28(%ebp), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    movl %eax, %edi
 ; FALLBACK28-NEXT:    shll %cl, %ebx
 ; FALLBACK28-NEXT:    movl 24(%ebp), %esi
-; FALLBACK28-NEXT:    movl %esi, %edi
-; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %ebx, %edi
-; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %ebx, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edi, %ecx
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    orl %esi, %ebp
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 36(%edx), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %esi, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 36(%ebp), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 32(%edx), %esi
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movl %esi, %edi
+; FALLBACK28-NEXT:    movl 32(%ebp), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %esi
+; FALLBACK28-NEXT:    shrl %esi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    orl %ebx, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %eax, %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %ebx, %edi
+; FALLBACK28-NEXT:    orl %ebp, %edi
 ; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 44(%ebp), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl 44(%eax), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl 40(%ebp), %esi
-; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl 40(%eax), %esi
+; FALLBACK28-NEXT:    movl %eax, %ebp
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %ebx, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %esi, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %edi
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %edi
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl 52(%ebp), %esi
 ; FALLBACK28-NEXT:    movl %esi, %edi
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    negl %edx
-; FALLBACK28-NEXT:    movl 176(%esp,%edx), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    negl %eax
+; FALLBACK28-NEXT:    movl 176(%esp,%eax), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, %ebp
 ; FALLBACK28-NEXT:    shrl %ebp
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %ebp
 ; FALLBACK28-NEXT:    orl %edi, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %edx
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    orl %ebx, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; FALLBACK28-NEXT:    movl 60(%edi), %edx
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
+; FALLBACK28-NEXT:    movl 60(%edi), %eax
+; FALLBACK28-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    shll %cl, %eax
 ; FALLBACK28-NEXT:    movl 56(%edi), %ebx
 ; FALLBACK28-NEXT:    movl %ebx, %edi
 ; FALLBACK28-NEXT:    shrl %edi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movl (%esp), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    shll %cl, %ebx
 ; FALLBACK28-NEXT:    shrl %esi
-; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    movl %edx, %ecx
 ; FALLBACK28-NEXT:    shrl %cl, %esi
 ; FALLBACK28-NEXT:    orl %ebx, %esi
 ; FALLBACK28-NEXT:    movl %eax, %ecx
@@ -19437,7 +19475,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK29-NEXT:    movl 8(%eax), %esi
 ; FALLBACK29-NEXT:    movl 12(%eax), %edx
 ; FALLBACK29-NEXT:    shll $3, %ecx
-; FALLBACK29-NEXT:    andl $24, %ecx
 ; FALLBACK29-NEXT:    movl %edx, %edi
 ; FALLBACK29-NEXT:    shldl %cl, %esi, %edi
 ; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -19538,7 +19575,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    leal (,%eax,8), %edx
-; FALLBACK30-NEXT:    andl $24, %edx
 ; FALLBACK30-NEXT:    andl $60, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    leal {{[0-9]+}}(%esp), %edi
@@ -19548,6 +19584,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl 4(%edi), %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    andb $24, %bl
 ; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    shrl %ecx
 ; FALLBACK30-NEXT:    shrxl %ebx, %ecx, %esi
@@ -19698,7 +19735,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK31-NEXT:    vmovups %zmm1, {{[0-9]+}}(%esp)
 ; FALLBACK31-NEXT:    vmovups %zmm0, {{[0-9]+}}(%esp)
 ; FALLBACK31-NEXT:    leal (,%ebx,8), %ecx
-; FALLBACK31-NEXT:    andl $24, %ecx
 ; FALLBACK31-NEXT:    andl $60, %ebx
 ; FALLBACK31-NEXT:    leal {{[0-9]+}}(%esp), %eax
 ; FALLBACK31-NEXT:    subl %ebx, %eax
@@ -20199,7 +20235,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; FALLBACK0-NEXT:    leal (,%rdi,8), %eax
-; FALLBACK0-NEXT:    andl $56, %eax
 ; FALLBACK0-NEXT:    andl $56, %edi
 ; FALLBACK0-NEXT:    movq -128(%rsp,%rdi), %r10
 ; FALLBACK0-NEXT:    movq -120(%rsp,%rdi), %r8
@@ -20207,6 +20242,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK0-NEXT:    movl %eax, %ecx
 ; FALLBACK0-NEXT:    shrq %cl, %r11
 ; FALLBACK0-NEXT:    movl %eax, %esi
+; FALLBACK0-NEXT:    andb $56, %sil
 ; FALLBACK0-NEXT:    notb %sil
 ; FALLBACK0-NEXT:    movq -112(%rsp,%rdi), %rbx
 ; FALLBACK0-NEXT:    leaq (%rbx,%rbx), %r9
@@ -20305,7 +20341,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK1-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK1-NEXT:    andl $56, %ecx
 ; FALLBACK1-NEXT:    andl $56, %eax
 ; FALLBACK1-NEXT:    movq -112(%rsp,%rax), %rdi
 ; FALLBACK1-NEXT:    movq -128(%rsp,%rax), %rsi
@@ -20376,54 +20411,54 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK2-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK2-NEXT:    andl $56, %ecx
 ; FALLBACK2-NEXT:    andl $56, %eax
 ; FALLBACK2-NEXT:    movq -120(%rsp,%rax), %rdi
-; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r9
-; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %rbx
+; FALLBACK2-NEXT:    movq -112(%rsp,%rax), %r10
+; FALLBACK2-NEXT:    shrxq %rcx, %rdi, %r9
 ; FALLBACK2-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
 ; FALLBACK2-NEXT:    movq -104(%rsp,%rax), %rsi
 ; FALLBACK2-NEXT:    shrxq %rcx, %rsi, %r8
-; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r10
-; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r11
+; FALLBACK2-NEXT:    movq -96(%rsp,%rax), %r11
+; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbx
 ; FALLBACK2-NEXT:    movq -88(%rsp,%rax), %r14
 ; FALLBACK2-NEXT:    shrxq %rcx, %r14, %r15
-; FALLBACK2-NEXT:    shrxq %rcx, %r10, %rbp
+; FALLBACK2-NEXT:    shrxq %rcx, %r11, %rbp
 ; FALLBACK2-NEXT:    movl %ecx, %r12d
+; FALLBACK2-NEXT:    andb $56, %r12b
 ; FALLBACK2-NEXT:    notb %r12b
-; FALLBACK2-NEXT:    addq %r9, %r9
-; FALLBACK2-NEXT:    shlxq %r12, %r9, %r9
-; FALLBACK2-NEXT:    orq %rbx, %r9
+; FALLBACK2-NEXT:    addq %r10, %r10
+; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
+; FALLBACK2-NEXT:    orq %r9, %r10
 ; FALLBACK2-NEXT:    addq %rdi, %rdi
 ; FALLBACK2-NEXT:    shlxq %r12, %rdi, %rdi
 ; FALLBACK2-NEXT:    orq %r13, %rdi
-; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %rbx
-; FALLBACK2-NEXT:    shrxq %rcx, %rbx, %r13
+; FALLBACK2-NEXT:    movq -80(%rsp,%rax), %r9
+; FALLBACK2-NEXT:    shrxq %rcx, %r9, %r13
 ; FALLBACK2-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK2-NEXT:    sarxq %rcx, %rax, %rcx
-; FALLBACK2-NEXT:    addq %r10, %r10
-; FALLBACK2-NEXT:    shlxq %r12, %r10, %r10
-; FALLBACK2-NEXT:    orq %r8, %r10
+; FALLBACK2-NEXT:    addq %r11, %r11
+; FALLBACK2-NEXT:    shlxq %r12, %r11, %r11
+; FALLBACK2-NEXT:    orq %r8, %r11
 ; FALLBACK2-NEXT:    addq %rsi, %rsi
 ; FALLBACK2-NEXT:    shlxq %r12, %rsi, %rsi
-; FALLBACK2-NEXT:    orq %r11, %rsi
-; FALLBACK2-NEXT:    leaq (%rbx,%rbx), %r8
+; FALLBACK2-NEXT:    orq %rbx, %rsi
+; FALLBACK2-NEXT:    leaq (%r9,%r9), %r8
 ; FALLBACK2-NEXT:    shlxq %r12, %r8, %r8
 ; FALLBACK2-NEXT:    orq %r15, %r8
 ; FALLBACK2-NEXT:    addq %r14, %r14
-; FALLBACK2-NEXT:    shlxq %r12, %r14, %r11
-; FALLBACK2-NEXT:    orq %rbp, %r11
+; FALLBACK2-NEXT:    shlxq %r12, %r14, %r9
+; FALLBACK2-NEXT:    orq %rbp, %r9
 ; FALLBACK2-NEXT:    addq %rax, %rax
 ; FALLBACK2-NEXT:    shlxq %r12, %rax, %rax
 ; FALLBACK2-NEXT:    orq %r13, %rax
 ; FALLBACK2-NEXT:    movq %rcx, 56(%rdx)
 ; FALLBACK2-NEXT:    movq %rax, 48(%rdx)
-; FALLBACK2-NEXT:    movq %r11, 32(%rdx)
+; FALLBACK2-NEXT:    movq %r9, 32(%rdx)
 ; FALLBACK2-NEXT:    movq %r8, 40(%rdx)
 ; FALLBACK2-NEXT:    movq %rsi, 16(%rdx)
-; FALLBACK2-NEXT:    movq %r10, 24(%rdx)
+; FALLBACK2-NEXT:    movq %r11, 24(%rdx)
 ; FALLBACK2-NEXT:    movq %rdi, (%rdx)
-; FALLBACK2-NEXT:    movq %r9, 8(%rdx)
+; FALLBACK2-NEXT:    movq %r10, 8(%rdx)
 ; FALLBACK2-NEXT:    addq $8, %rsp
 ; FALLBACK2-NEXT:    popq %rbx
 ; FALLBACK2-NEXT:    popq %r12
@@ -20465,7 +20500,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK3-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK3-NEXT:    andl $56, %ecx
 ; FALLBACK3-NEXT:    andl $56, %eax
 ; FALLBACK3-NEXT:    movq -112(%rsp,%rax), %rdi
 ; FALLBACK3-NEXT:    movq -128(%rsp,%rax), %rsi
@@ -20484,17 +20518,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK3-NEXT:    shrdq %cl, %r14, %r10
 ; FALLBACK3-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK3-NEXT:    shrdq %cl, %rax, %r11
-; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
-; FALLBACK3-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK3-NEXT:    shrdq %cl, %r9, %rsi
+; FALLBACK3-NEXT:    sarxq %rcx, %rax, %rax
 ; FALLBACK3-NEXT:    movq %r11, 48(%rdx)
+; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK3-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK3-NEXT:    movq %r15, 40(%rdx)
 ; FALLBACK3-NEXT:    movq %rdi, 16(%rdx)
 ; FALLBACK3-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK3-NEXT:    movq %rsi, (%rdx)
 ; FALLBACK3-NEXT:    movq %r8, 8(%rdx)
-; FALLBACK3-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK3-NEXT:    popq %rbx
 ; FALLBACK3-NEXT:    popq %r14
 ; FALLBACK3-NEXT:    popq %r15
@@ -20530,13 +20563,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK4-NEXT:    leal (,%rdi,8), %eax
-; FALLBACK4-NEXT:    andl $56, %eax
 ; FALLBACK4-NEXT:    andl $56, %edi
 ; FALLBACK4-NEXT:    movq -128(%rsp,%rdi), %r10
 ; FALLBACK4-NEXT:    movq -120(%rsp,%rdi), %r9
 ; FALLBACK4-NEXT:    movl %eax, %ecx
 ; FALLBACK4-NEXT:    shrq %cl, %r10
 ; FALLBACK4-NEXT:    movl %eax, %esi
+; FALLBACK4-NEXT:    andb $56, %sil
 ; FALLBACK4-NEXT:    notb %sil
 ; FALLBACK4-NEXT:    leaq (%r9,%r9), %r8
 ; FALLBACK4-NEXT:    movl %esi, %ecx
@@ -20632,7 +20665,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK5-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK5-NEXT:    andl $56, %ecx
 ; FALLBACK5-NEXT:    andl $56, %eax
 ; FALLBACK5-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK5-NEXT:    movq -104(%rsp,%rax), %r9
@@ -20698,9 +20730,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK6-NEXT:    leal (,%rax,8), %esi
-; FALLBACK6-NEXT:    andl $56, %esi
 ; FALLBACK6-NEXT:    andl $56, %eax
-; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK6-NEXT:    shrxq %rsi, -128(%rsp,%rax), %rbx
 ; FALLBACK6-NEXT:    movq -112(%rsp,%rax), %rcx
 ; FALLBACK6-NEXT:    movq -104(%rsp,%rax), %rdi
 ; FALLBACK6-NEXT:    shrxq %rsi, %rdi, %r12
@@ -20709,34 +20740,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq -88(%rsp,%rax), %r10
 ; FALLBACK6-NEXT:    shrxq %rsi, %r10, %r14
 ; FALLBACK6-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK6-NEXT:    movl %esi, %ebx
-; FALLBACK6-NEXT:    notb %bl
+; FALLBACK6-NEXT:    movl %esi, %r11d
+; FALLBACK6-NEXT:    andb $56, %r11b
+; FALLBACK6-NEXT:    notb %r11b
 ; FALLBACK6-NEXT:    movq -120(%rsp,%rax), %rbp
 ; FALLBACK6-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK6-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK6-NEXT:    orq %r11, %r8
-; FALLBACK6-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK6-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK6-NEXT:    orq %r12, %r11
+; FALLBACK6-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK6-NEXT:    orq %rbx, %r8
+; FALLBACK6-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK6-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK6-NEXT:    orq %r12, %rbx
 ; FALLBACK6-NEXT:    movq -80(%rsp,%rax), %r12
 ; FALLBACK6-NEXT:    shrxq %rsi, %r12, %r13
 ; FALLBACK6-NEXT:    shrxq %rsi, %rbp, %rbp
 ; FALLBACK6-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK6-NEXT:    sarxq %rsi, %rax, %rsi
 ; FALLBACK6-NEXT:    addq %rdi, %rdi
-; FALLBACK6-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK6-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK6-NEXT:    orq %r9, %rdi
 ; FALLBACK6-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK6-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK6-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK6-NEXT:    orq %r14, %r9
 ; FALLBACK6-NEXT:    addq %r10, %r10
-; FALLBACK6-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK6-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK6-NEXT:    orq %r15, %r10
 ; FALLBACK6-NEXT:    addq %rax, %rax
-; FALLBACK6-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK6-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK6-NEXT:    orq %r13, %rax
 ; FALLBACK6-NEXT:    addq %rcx, %rcx
-; FALLBACK6-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK6-NEXT:    shlxq %r11, %rcx, %rcx
 ; FALLBACK6-NEXT:    orq %rbp, %rcx
 ; FALLBACK6-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK6-NEXT:    movq %rcx, 8(%rdx)
@@ -20744,7 +20776,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK6-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK6-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK6-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK6-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK6-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK6-NEXT:    movq %r8, (%rdx)
 ; FALLBACK6-NEXT:    addq $8, %rsp
 ; FALLBACK6-NEXT:    popq %rbx
@@ -20781,7 +20813,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK7-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK7-NEXT:    andl $56, %ecx
 ; FALLBACK7-NEXT:    andl $56, %eax
 ; FALLBACK7-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK7-NEXT:    movq -104(%rsp,%rax), %r9
@@ -20801,17 +20832,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK7-NEXT:    movq -120(%rsp,%rax), %rax
 ; FALLBACK7-NEXT:    movq %rax, %r15
 ; FALLBACK7-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK7-NEXT:    sarxq %rcx, %r11, %r10
-; FALLBACK7-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK7-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK7-NEXT:    sarxq %rcx, %r11, %rax
 ; FALLBACK7-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK7-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK7-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK7-NEXT:    movq %rdi, 32(%rdx)
 ; FALLBACK7-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK7-NEXT:    movq %r8, 16(%rdx)
 ; FALLBACK7-NEXT:    movq %rsi, 24(%rdx)
 ; FALLBACK7-NEXT:    movq %r14, (%rdx)
-; FALLBACK7-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK7-NEXT:    popq %rbx
 ; FALLBACK7-NEXT:    popq %r14
 ; FALLBACK7-NEXT:    popq %r15
@@ -20845,13 +20875,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK8-NEXT:    leal (,%rdi,8), %eax
-; FALLBACK8-NEXT:    andl $56, %eax
 ; FALLBACK8-NEXT:    andl $56, %edi
 ; FALLBACK8-NEXT:    movq -128(%rsp,%rdi), %r10
 ; FALLBACK8-NEXT:    movq -120(%rsp,%rdi), %r9
 ; FALLBACK8-NEXT:    movl %eax, %ecx
 ; FALLBACK8-NEXT:    shrq %cl, %r10
 ; FALLBACK8-NEXT:    movl %eax, %esi
+; FALLBACK8-NEXT:    andb $56, %sil
 ; FALLBACK8-NEXT:    notb %sil
 ; FALLBACK8-NEXT:    leaq (%r9,%r9), %r8
 ; FALLBACK8-NEXT:    movl %esi, %ecx
@@ -20946,7 +20976,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK9-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK9-NEXT:    andl $56, %ecx
 ; FALLBACK9-NEXT:    andl $56, %eax
 ; FALLBACK9-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK9-NEXT:    movq -104(%rsp,%rax), %r9
@@ -21011,9 +21040,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK10-NEXT:    leal (,%rax,8), %esi
-; FALLBACK10-NEXT:    andl $56, %esi
 ; FALLBACK10-NEXT:    andl $56, %eax
-; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK10-NEXT:    shrxq %rsi, -128(%rsp,%rax), %rbx
 ; FALLBACK10-NEXT:    movq -112(%rsp,%rax), %rcx
 ; FALLBACK10-NEXT:    movq -104(%rsp,%rax), %rdi
 ; FALLBACK10-NEXT:    shrxq %rsi, %rdi, %r12
@@ -21022,34 +21050,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq -88(%rsp,%rax), %r10
 ; FALLBACK10-NEXT:    shrxq %rsi, %r10, %r14
 ; FALLBACK10-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK10-NEXT:    movl %esi, %ebx
-; FALLBACK10-NEXT:    notb %bl
+; FALLBACK10-NEXT:    movl %esi, %r11d
+; FALLBACK10-NEXT:    andb $56, %r11b
+; FALLBACK10-NEXT:    notb %r11b
 ; FALLBACK10-NEXT:    movq -120(%rsp,%rax), %rbp
 ; FALLBACK10-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK10-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK10-NEXT:    orq %r11, %r8
-; FALLBACK10-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK10-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK10-NEXT:    orq %r12, %r11
+; FALLBACK10-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK10-NEXT:    orq %rbx, %r8
+; FALLBACK10-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK10-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK10-NEXT:    orq %r12, %rbx
 ; FALLBACK10-NEXT:    movq -80(%rsp,%rax), %r12
 ; FALLBACK10-NEXT:    shrxq %rsi, %r12, %r13
 ; FALLBACK10-NEXT:    shrxq %rsi, %rbp, %rbp
 ; FALLBACK10-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK10-NEXT:    sarxq %rsi, %rax, %rsi
 ; FALLBACK10-NEXT:    addq %rdi, %rdi
-; FALLBACK10-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK10-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK10-NEXT:    orq %r9, %rdi
 ; FALLBACK10-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK10-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK10-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK10-NEXT:    orq %r14, %r9
 ; FALLBACK10-NEXT:    addq %r10, %r10
-; FALLBACK10-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK10-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK10-NEXT:    orq %r15, %r10
 ; FALLBACK10-NEXT:    addq %rax, %rax
-; FALLBACK10-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK10-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK10-NEXT:    orq %r13, %rax
 ; FALLBACK10-NEXT:    addq %rcx, %rcx
-; FALLBACK10-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK10-NEXT:    shlxq %r11, %rcx, %rcx
 ; FALLBACK10-NEXT:    orq %rbp, %rcx
 ; FALLBACK10-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK10-NEXT:    movq %rcx, 8(%rdx)
@@ -21057,7 +21086,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK10-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK10-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK10-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK10-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK10-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK10-NEXT:    movq %r8, (%rdx)
 ; FALLBACK10-NEXT:    addq $8, %rsp
 ; FALLBACK10-NEXT:    popq %rbx
@@ -21093,7 +21122,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK11-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK11-NEXT:    andl $56, %ecx
 ; FALLBACK11-NEXT:    andl $56, %eax
 ; FALLBACK11-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK11-NEXT:    movq -104(%rsp,%rax), %r9
@@ -21113,17 +21141,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK11-NEXT:    movq -120(%rsp,%rax), %rax
 ; FALLBACK11-NEXT:    movq %rax, %r15
 ; FALLBACK11-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK11-NEXT:    sarxq %rcx, %r11, %r10
-; FALLBACK11-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK11-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK11-NEXT:    sarxq %rcx, %r11, %rax
 ; FALLBACK11-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK11-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK11-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK11-NEXT:    movq %rdi, 32(%rdx)
 ; FALLBACK11-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK11-NEXT:    movq %r8, 16(%rdx)
 ; FALLBACK11-NEXT:    movq %rsi, 24(%rdx)
 ; FALLBACK11-NEXT:    movq %r14, (%rdx)
-; FALLBACK11-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK11-NEXT:    popq %rbx
 ; FALLBACK11-NEXT:    popq %r14
 ; FALLBACK11-NEXT:    popq %r15
@@ -21158,13 +21185,13 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; FALLBACK12-NEXT:    leal (,%rdi,8), %eax
-; FALLBACK12-NEXT:    andl $56, %eax
 ; FALLBACK12-NEXT:    andl $56, %edi
 ; FALLBACK12-NEXT:    movq -128(%rsp,%rdi), %r10
 ; FALLBACK12-NEXT:    movq -120(%rsp,%rdi), %r9
 ; FALLBACK12-NEXT:    movl %eax, %ecx
 ; FALLBACK12-NEXT:    shrq %cl, %r10
 ; FALLBACK12-NEXT:    movl %eax, %esi
+; FALLBACK12-NEXT:    andb $56, %sil
 ; FALLBACK12-NEXT:    notb %sil
 ; FALLBACK12-NEXT:    leaq (%r9,%r9), %r8
 ; FALLBACK12-NEXT:    movl %esi, %ecx
@@ -21259,7 +21286,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK13-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK13-NEXT:    andl $56, %ecx
 ; FALLBACK13-NEXT:    andl $56, %eax
 ; FALLBACK13-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK13-NEXT:    movq -104(%rsp,%rax), %r9
@@ -21324,9 +21350,8 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK14-NEXT:    leal (,%rax,8), %esi
-; FALLBACK14-NEXT:    andl $56, %esi
 ; FALLBACK14-NEXT:    andl $56, %eax
-; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %r11
+; FALLBACK14-NEXT:    shrxq %rsi, -128(%rsp,%rax), %rbx
 ; FALLBACK14-NEXT:    movq -112(%rsp,%rax), %rcx
 ; FALLBACK14-NEXT:    movq -104(%rsp,%rax), %rdi
 ; FALLBACK14-NEXT:    shrxq %rsi, %rdi, %r12
@@ -21335,34 +21360,35 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq -88(%rsp,%rax), %r10
 ; FALLBACK14-NEXT:    shrxq %rsi, %r10, %r14
 ; FALLBACK14-NEXT:    shrxq %rsi, %r13, %r15
-; FALLBACK14-NEXT:    movl %esi, %ebx
-; FALLBACK14-NEXT:    notb %bl
+; FALLBACK14-NEXT:    movl %esi, %r11d
+; FALLBACK14-NEXT:    andb $56, %r11b
+; FALLBACK14-NEXT:    notb %r11b
 ; FALLBACK14-NEXT:    movq -120(%rsp,%rax), %rbp
 ; FALLBACK14-NEXT:    leaq (%rbp,%rbp), %r8
-; FALLBACK14-NEXT:    shlxq %rbx, %r8, %r8
-; FALLBACK14-NEXT:    orq %r11, %r8
-; FALLBACK14-NEXT:    leaq (%r13,%r13), %r11
-; FALLBACK14-NEXT:    shlxq %rbx, %r11, %r11
-; FALLBACK14-NEXT:    orq %r12, %r11
+; FALLBACK14-NEXT:    shlxq %r11, %r8, %r8
+; FALLBACK14-NEXT:    orq %rbx, %r8
+; FALLBACK14-NEXT:    leaq (%r13,%r13), %rbx
+; FALLBACK14-NEXT:    shlxq %r11, %rbx, %rbx
+; FALLBACK14-NEXT:    orq %r12, %rbx
 ; FALLBACK14-NEXT:    movq -80(%rsp,%rax), %r12
 ; FALLBACK14-NEXT:    shrxq %rsi, %r12, %r13
 ; FALLBACK14-NEXT:    shrxq %rsi, %rbp, %rbp
 ; FALLBACK14-NEXT:    movq -72(%rsp,%rax), %rax
 ; FALLBACK14-NEXT:    sarxq %rsi, %rax, %rsi
 ; FALLBACK14-NEXT:    addq %rdi, %rdi
-; FALLBACK14-NEXT:    shlxq %rbx, %rdi, %rdi
+; FALLBACK14-NEXT:    shlxq %r11, %rdi, %rdi
 ; FALLBACK14-NEXT:    orq %r9, %rdi
 ; FALLBACK14-NEXT:    leaq (%r12,%r12), %r9
-; FALLBACK14-NEXT:    shlxq %rbx, %r9, %r9
+; FALLBACK14-NEXT:    shlxq %r11, %r9, %r9
 ; FALLBACK14-NEXT:    orq %r14, %r9
 ; FALLBACK14-NEXT:    addq %r10, %r10
-; FALLBACK14-NEXT:    shlxq %rbx, %r10, %r10
+; FALLBACK14-NEXT:    shlxq %r11, %r10, %r10
 ; FALLBACK14-NEXT:    orq %r15, %r10
 ; FALLBACK14-NEXT:    addq %rax, %rax
-; FALLBACK14-NEXT:    shlxq %rbx, %rax, %rax
+; FALLBACK14-NEXT:    shlxq %r11, %rax, %rax
 ; FALLBACK14-NEXT:    orq %r13, %rax
 ; FALLBACK14-NEXT:    addq %rcx, %rcx
-; FALLBACK14-NEXT:    shlxq %rbx, %rcx, %rcx
+; FALLBACK14-NEXT:    shlxq %r11, %rcx, %rcx
 ; FALLBACK14-NEXT:    orq %rbp, %rcx
 ; FALLBACK14-NEXT:    movq %rsi, 56(%rdx)
 ; FALLBACK14-NEXT:    movq %rcx, 8(%rdx)
@@ -21370,7 +21396,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK14-NEXT:    movq %r10, 32(%rdx)
 ; FALLBACK14-NEXT:    movq %r9, 40(%rdx)
 ; FALLBACK14-NEXT:    movq %rdi, 16(%rdx)
-; FALLBACK14-NEXT:    movq %r11, 24(%rdx)
+; FALLBACK14-NEXT:    movq %rbx, 24(%rdx)
 ; FALLBACK14-NEXT:    movq %r8, (%rdx)
 ; FALLBACK14-NEXT:    addq $8, %rsp
 ; FALLBACK14-NEXT:    popq %rbx
@@ -21406,7 +21432,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; FALLBACK15-NEXT:    leal (,%rax,8), %ecx
-; FALLBACK15-NEXT:    andl $56, %ecx
 ; FALLBACK15-NEXT:    andl $56, %eax
 ; FALLBACK15-NEXT:    movq -96(%rsp,%rax), %rdi
 ; FALLBACK15-NEXT:    movq -104(%rsp,%rax), %r9
@@ -21426,17 +21451,16 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK15-NEXT:    movq -120(%rsp,%rax), %rax
 ; FALLBACK15-NEXT:    movq %rax, %r15
 ; FALLBACK15-NEXT:    shrdq %cl, %r10, %r15
-; FALLBACK15-NEXT:    sarxq %rcx, %r11, %r10
-; FALLBACK15-NEXT:    # kill: def $cl killed $cl killed $rcx
 ; FALLBACK15-NEXT:    shrdq %cl, %rax, %r14
+; FALLBACK15-NEXT:    sarxq %rcx, %r11, %rax
 ; FALLBACK15-NEXT:    movq %r15, 8(%rdx)
 ; FALLBACK15-NEXT:    movq %r9, 48(%rdx)
+; FALLBACK15-NEXT:    movq %rax, 56(%rdx)
 ; FALLBACK15-NEXT:    movq %rdi, 32(%rdx)
 ; FALLBACK15-NEXT:    movq %rbx, 40(%rdx)
 ; FALLBACK15-NEXT:    movq %r8, 16(%rdx)
 ; FALLBACK15-NEXT:    movq %rsi, 24(%rdx)
 ; FALLBACK15-NEXT:    movq %r14, (%rdx)
-; FALLBACK15-NEXT:    movq %r10, 56(%rdx)
 ; FALLBACK15-NEXT:    popq %rbx
 ; FALLBACK15-NEXT:    popq %r14
 ; FALLBACK15-NEXT:    popq %r15
@@ -21522,46 +21546,46 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK16-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK16-NEXT:    movl %ebp, %ecx
-; FALLBACK16-NEXT:    movl %ebp, %esi
-; FALLBACK16-NEXT:    andl $60, %esi
-; FALLBACK16-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK16-NEXT:    shll $3, %ecx
-; FALLBACK16-NEXT:    andl $24, %ecx
-; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    andl $60, %eax
+; FALLBACK16-NEXT:    movl 68(%esp,%eax), %edi
+; FALLBACK16-NEXT:    movl %eax, %ebx
+; FALLBACK16-NEXT:    shll $3, %edx
+; FALLBACK16-NEXT:    movl %edi, %eax
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 72(%esp,%esi), %edi
-; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    addl %edi, %edi
-; FALLBACK16-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK16-NEXT:    movl %ecx, %ebx
-; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movl 72(%esp,%ebx), %ecx
+; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK16-NEXT:    movb %dl, %ch
+; FALLBACK16-NEXT:    andb $24, %ch
 ; FALLBACK16-NEXT:    notb %ch
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %eax, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
+; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    addl %edi, %edi
+; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
 ; FALLBACK16-NEXT:    orl %eax, %edi
 ; FALLBACK16-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 64(%esp,%esi), %eax
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    addl %edx, %edx
+; FALLBACK16-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK16-NEXT:    leal (%edi,%edi), %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    shll %cl, %esi
+; FALLBACK16-NEXT:    orl %eax, %esi
+; FALLBACK16-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 76(%esp,%esi), %ebp
-; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %edx
-; FALLBACK16-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK16-NEXT:    leal (%edi,%edi), %eax
-; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %eax
-; FALLBACK16-NEXT:    orl %edx, %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; FALLBACK16-NEXT:    shrl %cl, %eax
 ; FALLBACK16-NEXT:    addl %ebp, %ebp
@@ -21569,30 +21593,28 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    shll %cl, %ebp
 ; FALLBACK16-NEXT:    orl %eax, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %esi, %edx
-; FALLBACK16-NEXT:    movl 84(%esp,%esi), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 84(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %eax
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 88(%esp,%esi), %esi
-; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movl 88(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %edx
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    shll %cl, %edx
+; FALLBACK16-NEXT:    orl %eax, %edx
+; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %edx, %eax
-; FALLBACK16-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl 92(%esp,%edx), %ebp
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK16-NEXT:    movl 92(%esp,%eax), %ebp
 ; FALLBACK16-NEXT:    movl %ebp, %edx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; FALLBACK16-NEXT:    movb %bl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edx
 ; FALLBACK16-NEXT:    movl 96(%esp,%eax), %edi
@@ -21602,87 +21624,85 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK16-NEXT:    orl %edx, %eax
 ; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movl %ebx, %eax
 ; FALLBACK16-NEXT:    shrl %cl, %esi
 ; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
 ; FALLBACK16-NEXT:    orl %esi, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 100(%esp,%edx), %eax
-; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 104(%esp,%edx), %esi
-; FALLBACK16-NEXT:    leal (%esi,%esi), %ebp
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK16-NEXT:    movl %ebp, %edx
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl 104(%esp,%ebx), %esi
+; FALLBACK16-NEXT:    leal (%esi,%esi), %eax
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebp
-; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    shll %cl, %eax
+; FALLBACK16-NEXT:    orl %edx, %eax
+; FALLBACK16-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %edi
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK16-NEXT:    addl %ebx, %ebx
+; FALLBACK16-NEXT:    addl %ebp, %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %ebx
-; FALLBACK16-NEXT:    orl %edi, %ebx
-; FALLBACK16-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; FALLBACK16-NEXT:    movl 108(%esp,%ebp), %edi
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edi, %ebp
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl 108(%esp,%ebx), %edi
 ; FALLBACK16-NEXT:    movl %edi, %eax
-; FALLBACK16-NEXT:    movl %edx, %ebx
-; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movl %edx, %ecx
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 112(%esp,%ebp), %ecx
+; FALLBACK16-NEXT:    movl 112(%esp,%ebx), %ecx
 ; FALLBACK16-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movl %ebp, %edx
 ; FALLBACK16-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
 ; FALLBACK16-NEXT:    orl %eax, %ebp
 ; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK16-NEXT:    movb %bl, %cl
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %esi
 ; FALLBACK16-NEXT:    addl %edi, %edi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %edi
 ; FALLBACK16-NEXT:    orl %esi, %edi
-; FALLBACK16-NEXT:    movl 116(%esp,%edx), %esi
+; FALLBACK16-NEXT:    movl 116(%esp,%ebx), %esi
 ; FALLBACK16-NEXT:    movl %esi, %eax
-; FALLBACK16-NEXT:    movl %ebx, %ecx
+; FALLBACK16-NEXT:    movb %dl, %cl
 ; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl 120(%esp,%edx), %edx
-; FALLBACK16-NEXT:    leal (%edx,%edx), %ebp
-; FALLBACK16-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; FALLBACK16-NEXT:    movl 120(%esp,%ebx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %ebp
 ; FALLBACK16-NEXT:    orl %eax, %ebp
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK16-NEXT:    shrl %cl, %eax
+; FALLBACK16-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK16-NEXT:    movl %edx, %eax
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK16-NEXT:    shrl %cl, %edx
 ; FALLBACK16-NEXT:    addl %esi, %esi
 ; FALLBACK16-NEXT:    movb %ch, %cl
 ; FALLBACK16-NEXT:    shll %cl, %esi
-; FALLBACK16-NEXT:    orl %eax, %esi
-; FALLBACK16-NEXT:    movb %bl, %cl
-; FALLBACK16-NEXT:    movl %edx, %eax
-; FALLBACK16-NEXT:    shrl %cl, %eax
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK16-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK16-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK16-NEXT:    orl %edx, %esi
+; FALLBACK16-NEXT:    movb %al, %cl
+; FALLBACK16-NEXT:    movl %ebx, %edx
+; FALLBACK16-NEXT:    shrl %cl, %edx
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK16-NEXT:    movl 124(%esp,%ebx), %ebx
+; FALLBACK16-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK16-NEXT:    movb %ch, %cl
-; FALLBACK16-NEXT:    shll %cl, %edx
-; FALLBACK16-NEXT:    orl %eax, %edx
-; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK16-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK16-NEXT:    shll %cl, %ebp
+; FALLBACK16-NEXT:    orl %edx, %ebp
+; FALLBACK16-NEXT:    movl %eax, %ecx
 ; FALLBACK16-NEXT:    sarl %cl, %ebx
 ; FALLBACK16-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK16-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK16-NEXT:    movl %edx, 56(%eax)
+; FALLBACK16-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK16-NEXT:    movl %esi, 48(%eax)
-; FALLBACK16-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK16-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK16-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK16-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK16-NEXT:    movl %ecx, 44(%eax)
@@ -21792,91 +21812,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK17-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK17-NEXT:    movl %ecx, %ebp
-; FALLBACK17-NEXT:    andl $60, %ebp
-; FALLBACK17-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK17-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shll $3, %ecx
-; FALLBACK17-NEXT:    andl $24, %ecx
-; FALLBACK17-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK17-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK17-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %esi
-; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK17-NEXT:    movl %ecx, %eax
+; FALLBACK17-NEXT:    andl $60, %eax
+; FALLBACK17-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
+; FALLBACK17-NEXT:    shll $3, %ecx
 ; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK17-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %edi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK17-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK17-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl %esi, %edx
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK17-NEXT:    movl %edi, (%esp) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK17-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edi
-; FALLBACK17-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK17-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK17-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    movl %eax, %edi
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK17-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK17-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK17-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK17-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %esi
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl %edi, %esi
 ; FALLBACK17-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK17-NEXT:    movl %ebx, (%esp) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK17-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK17-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK17-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK17-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK17-NEXT:    movl %edx, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK17-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK17-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK17-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK17-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK17-NEXT:    movl %esi, 56(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK17-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK17-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK17-NEXT:    sarl %cl, %eax
-; FALLBACK17-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK17-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK17-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK17-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK17-NEXT:    movl %ebx, (%ebp)
-; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK17-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK17-NEXT:    sarl %cl, %edx
+; FALLBACK17-NEXT:    movl %edx, 60(%eax)
+; FALLBACK17-NEXT:    movl %edi, 48(%eax)
+; FALLBACK17-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK17-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK17-NEXT:    movl %ebp, (%eax)
+; FALLBACK17-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK17-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK17-NEXT:    addl $188, %esp
 ; FALLBACK17-NEXT:    popl %esi
 ; FALLBACK17-NEXT:    popl %edi
@@ -21965,7 +21984,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK18-NEXT:    movl %eax, %ecx
 ; FALLBACK18-NEXT:    leal (,%eax,8), %edx
-; FALLBACK18-NEXT:    andl $24, %edx
 ; FALLBACK18-NEXT:    andl $60, %ecx
 ; FALLBACK18-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK18-NEXT:    movl 72(%esp,%ecx), %edi
@@ -21973,6 +21991,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK18-NEXT:    shrxl %edx, %esi, %eax
 ; FALLBACK18-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK18-NEXT:    movl %edx, %ebx
+; FALLBACK18-NEXT:    andb $24, %bl
 ; FALLBACK18-NEXT:    notb %bl
 ; FALLBACK18-NEXT:    leal (%edi,%edi), %ebp
 ; FALLBACK18-NEXT:    shlxl %ebx, %ebp, %eax
@@ -22185,7 +22204,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK19-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK19-NEXT:    shll $3, %ecx
-; FALLBACK19-NEXT:    andl $24, %ecx
 ; FALLBACK19-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK19-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK19-NEXT:    movl 64(%esp,%ebp), %edi
@@ -22310,159 +22328,162 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK20-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK20-NEXT:    movl %eax, %esi
-; FALLBACK20-NEXT:    andl $60, %esi
-; FALLBACK20-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK20-NEXT:    shll $3, %eax
-; FALLBACK20-NEXT:    andl $24, %eax
-; FALLBACK20-NEXT:    movl %edx, %edi
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    movl %eax, %edx
+; FALLBACK20-NEXT:    andl $60, %eax
+; FALLBACK20-NEXT:    movl 68(%esp,%eax), %edi
+; FALLBACK20-NEXT:    movl %eax, %ebx
+; FALLBACK20-NEXT:    shll $3, %edx
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 72(%esp,%ebx), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK20-NEXT:    movb %al, %ch
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK20-NEXT:    movb %dl, %ch
+; FALLBACK20-NEXT:    andb $24, %ch
 ; FALLBACK20-NEXT:    notb %ch
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %edi, %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %edi, %edx
+; FALLBACK20-NEXT:    shll %cl, %edi
+; FALLBACK20-NEXT:    orl %eax, %edi
+; FALLBACK20-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK20-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK20-NEXT:    movb %ch, %cl
+; FALLBACK20-NEXT:    shll %cl, %esi
+; FALLBACK20-NEXT:    orl %eax, %esi
+; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK20-NEXT:    movl %edx, %ebp
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK20-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    addl %ebp, %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    addl %edx, %edx
+; FALLBACK20-NEXT:    movl 84(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %eax
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 88(%esp,%ebx), %esi
+; FALLBACK20-NEXT:    leal (%esi,%esi), %edx
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %ebx, %edx
+; FALLBACK20-NEXT:    orl %eax, %edx
 ; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movl %eax, %edx
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    addl %eax, %eax
-; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
-; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %bl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    addl %ebp, %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK20-NEXT:    movl 92(%esp,%eax), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %edx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 96(%esp,%eax), %edi
 ; FALLBACK20-NEXT:    leal (%edi,%edi), %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    movb %bl, %cl
+; FALLBACK20-NEXT:    movl %ebx, %eax
+; FALLBACK20-NEXT:    shrl %cl, %esi
+; FALLBACK20-NEXT:    addl %ebp, %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %eax, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK20-NEXT:    movl %ebx, %ebp
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK20-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %esi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK20-NEXT:    movl %ebp, %edx
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl 104(%esp,%ebx), %esi
+; FALLBACK20-NEXT:    leal (%esi,%esi), %eax
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %eax
-; FALLBACK20-NEXT:    orl %ebp, %eax
+; FALLBACK20-NEXT:    orl %edx, %eax
 ; FALLBACK20-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK20-NEXT:    movb %dl, %cl
 ; FALLBACK20-NEXT:    shrl %cl, %edi
-; FALLBACK20-NEXT:    addl %ebx, %ebx
+; FALLBACK20-NEXT:    addl %ebp, %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %edi, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK20-NEXT:    movl %edi, %ebp
-; FALLBACK20-NEXT:    movl %eax, %ecx
-; FALLBACK20-NEXT:    shrl %cl, %ebp
-; FALLBACK20-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edi, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl 108(%esp,%ebx), %edi
+; FALLBACK20-NEXT:    movl %edi, %eax
+; FALLBACK20-NEXT:    movl %edx, %ecx
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 112(%esp,%ebx), %ecx
 ; FALLBACK20-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK20-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK20-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %ebx
-; FALLBACK20-NEXT:    orl %ebp, %ebx
-; FALLBACK20-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %esi
 ; FALLBACK20-NEXT:    addl %edi, %edi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %edi
-; FALLBACK20-NEXT:    orl %edx, %edi
-; FALLBACK20-NEXT:    movl %esi, %edx
-; FALLBACK20-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK20-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK20-NEXT:    movl %esi, %ebx
-; FALLBACK20-NEXT:    movb %al, %cl
-; FALLBACK20-NEXT:    shrl %cl, %ebx
-; FALLBACK20-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK20-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK20-NEXT:    orl %esi, %edi
+; FALLBACK20-NEXT:    movl 116(%esp,%ebx), %esi
+; FALLBACK20-NEXT:    movl %esi, %eax
+; FALLBACK20-NEXT:    movb %dl, %cl
+; FALLBACK20-NEXT:    shrl %cl, %eax
+; FALLBACK20-NEXT:    movl 120(%esp,%ebx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %ebp
-; FALLBACK20-NEXT:    orl %ebx, %ebp
+; FALLBACK20-NEXT:    orl %eax, %ebp
+; FALLBACK20-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK20-NEXT:    movl %edx, %eax
+; FALLBACK20-NEXT:    movb %al, %cl
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK20-NEXT:    shrl %cl, %ebx
+; FALLBACK20-NEXT:    shrl %cl, %edx
 ; FALLBACK20-NEXT:    addl %esi, %esi
 ; FALLBACK20-NEXT:    movb %ch, %cl
 ; FALLBACK20-NEXT:    shll %cl, %esi
-; FALLBACK20-NEXT:    orl %ebx, %esi
-; FALLBACK20-NEXT:    movb %dl, %cl
-; FALLBACK20-NEXT:    shrl %cl, %eax
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK20-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK20-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK20-NEXT:    orl %edx, %esi
+; FALLBACK20-NEXT:    movb %al, %cl
+; FALLBACK20-NEXT:    movl %ebx, %edx
+; FALLBACK20-NEXT:    shrl %cl, %edx
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK20-NEXT:    movl 124(%esp,%ebx), %ebx
+; FALLBACK20-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK20-NEXT:    movb %ch, %cl
-; FALLBACK20-NEXT:    shll %cl, %edx
-; FALLBACK20-NEXT:    orl %eax, %edx
-; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK20-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK20-NEXT:    shll %cl, %ebp
+; FALLBACK20-NEXT:    orl %edx, %ebp
+; FALLBACK20-NEXT:    movl %eax, %ecx
 ; FALLBACK20-NEXT:    sarl %cl, %ebx
 ; FALLBACK20-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK20-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK20-NEXT:    movl %edx, 56(%eax)
+; FALLBACK20-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK20-NEXT:    movl %esi, 48(%eax)
-; FALLBACK20-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK20-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK20-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK20-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK20-NEXT:    movl %ecx, 44(%eax)
@@ -22534,91 +22555,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK21-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK21-NEXT:    movl %ecx, %ebp
-; FALLBACK21-NEXT:    andl $60, %ebp
-; FALLBACK21-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK21-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shll $3, %ecx
-; FALLBACK21-NEXT:    andl $24, %ecx
-; FALLBACK21-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK21-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK21-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %esi
-; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK21-NEXT:    movl %ecx, %eax
+; FALLBACK21-NEXT:    andl $60, %eax
+; FALLBACK21-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
+; FALLBACK21-NEXT:    shll $3, %ecx
 ; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK21-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK21-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %edi
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK21-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK21-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK21-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl %esi, %edx
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK21-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edi
-; FALLBACK21-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK21-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK21-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    movl %eax, %edi
 ; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK21-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK21-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK21-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK21-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %esi
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK21-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl %edi, %esi
 ; FALLBACK21-NEXT:    shrdl %cl, %edx, %ebx
-; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK21-NEXT:    sarl %cl, %eax
-; FALLBACK21-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK21-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK21-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK21-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK21-NEXT:    movl %ebx, (%ebp)
-; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK21-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK21-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK21-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK21-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK21-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK21-NEXT:    movl %edx, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK21-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK21-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK21-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK21-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK21-NEXT:    movl %esi, 56(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK21-NEXT:    shrdl %cl, %esi, %ebp
+; FALLBACK21-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK21-NEXT:    sarl %cl, %edx
+; FALLBACK21-NEXT:    movl %edx, 60(%eax)
+; FALLBACK21-NEXT:    movl %edi, 48(%eax)
+; FALLBACK21-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK21-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK21-NEXT:    movl %ebp, (%eax)
+; FALLBACK21-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK21-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK21-NEXT:    addl $188, %esp
 ; FALLBACK21-NEXT:    popl %esi
 ; FALLBACK21-NEXT:    popl %edi
@@ -22669,7 +22689,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK22-NEXT:    movl %eax, %ecx
 ; FALLBACK22-NEXT:    leal (,%eax,8), %edx
-; FALLBACK22-NEXT:    andl $24, %edx
 ; FALLBACK22-NEXT:    andl $60, %ecx
 ; FALLBACK22-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK22-NEXT:    movl 72(%esp,%ecx), %edi
@@ -22677,6 +22696,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK22-NEXT:    shrxl %edx, %esi, %eax
 ; FALLBACK22-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK22-NEXT:    movl %edx, %ebx
+; FALLBACK22-NEXT:    andb $24, %bl
 ; FALLBACK22-NEXT:    notb %bl
 ; FALLBACK22-NEXT:    leal (%edi,%edi), %ebp
 ; FALLBACK22-NEXT:    shlxl %ebx, %ebp, %eax
@@ -22851,7 +22871,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK23-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    shll $3, %ecx
-; FALLBACK23-NEXT:    andl $24, %ecx
 ; FALLBACK23-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK23-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK23-NEXT:    movl 64(%esp,%ebp), %edi
@@ -22942,191 +22961,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK24-NEXT:    pushl %edi
 ; FALLBACK24-NEXT:    pushl %esi
 ; FALLBACK24-NEXT:    subl $204, %esp
-; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK24-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK24-NEXT:    vmovups 32(%ecx), %xmm1
-; FALLBACK24-NEXT:    movl 48(%ecx), %edx
-; FALLBACK24-NEXT:    movl 52(%ecx), %esi
-; FALLBACK24-NEXT:    movl 56(%ecx), %edi
-; FALLBACK24-NEXT:    movl 60(%ecx), %ecx
-; FALLBACK24-NEXT:    movl (%eax), %eax
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK24-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK24-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK24-NEXT:    movl 48(%eax), %edx
+; FALLBACK24-NEXT:    movl 52(%eax), %esi
+; FALLBACK24-NEXT:    movl 56(%eax), %edi
+; FALLBACK24-NEXT:    movl 60(%eax), %eax
+; FALLBACK24-NEXT:    movl (%ecx), %ecx
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK24-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    sarl $31, %ecx
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK24-NEXT:    movl %eax, %esi
-; FALLBACK24-NEXT:    andl $60, %esi
-; FALLBACK24-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK24-NEXT:    shll $3, %eax
-; FALLBACK24-NEXT:    andl $24, %eax
-; FALLBACK24-NEXT:    movl %edx, %edi
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    sarl $31, %eax
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK24-NEXT:    movl %ecx, %edx
+; FALLBACK24-NEXT:    movl %ecx, %eax
+; FALLBACK24-NEXT:    andl $60, %eax
+; FALLBACK24-NEXT:    movl 68(%esp,%eax), %edi
+; FALLBACK24-NEXT:    movl %eax, %ebx
+; FALLBACK24-NEXT:    shll $3, %edx
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 72(%esp,%ebx), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK24-NEXT:    movb %al, %ch
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK24-NEXT:    movb %dl, %ch
+; FALLBACK24-NEXT:    andb $24, %ch
 ; FALLBACK24-NEXT:    notb %ch
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %edi, %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %edi, %edx
+; FALLBACK24-NEXT:    shll %cl, %edi
+; FALLBACK24-NEXT:    orl %eax, %edi
+; FALLBACK24-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK24-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK24-NEXT:    movb %ch, %cl
+; FALLBACK24-NEXT:    shll %cl, %esi
+; FALLBACK24-NEXT:    orl %eax, %esi
+; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK24-NEXT:    movl %edx, %ebp
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK24-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    addl %ebp, %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    addl %edx, %edx
+; FALLBACK24-NEXT:    movl 84(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %eax
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 88(%esp,%ebx), %esi
+; FALLBACK24-NEXT:    leal (%esi,%esi), %edx
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %ebx, %edx
+; FALLBACK24-NEXT:    orl %eax, %edx
 ; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movl %eax, %edx
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    addl %eax, %eax
-; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
-; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %bl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    addl %ebp, %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK24-NEXT:    movl 92(%esp,%eax), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %edx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 96(%esp,%eax), %edi
 ; FALLBACK24-NEXT:    leal (%edi,%edi), %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    movb %bl, %cl
+; FALLBACK24-NEXT:    movl %ebx, %eax
+; FALLBACK24-NEXT:    shrl %cl, %esi
+; FALLBACK24-NEXT:    addl %ebp, %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %eax, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK24-NEXT:    movl %ebx, %ebp
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK24-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %esi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK24-NEXT:    movl %ebp, %edx
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl 104(%esp,%ebx), %esi
+; FALLBACK24-NEXT:    leal (%esi,%esi), %eax
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %eax
-; FALLBACK24-NEXT:    orl %ebp, %eax
+; FALLBACK24-NEXT:    orl %edx, %eax
 ; FALLBACK24-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK24-NEXT:    movb %dl, %cl
 ; FALLBACK24-NEXT:    shrl %cl, %edi
-; FALLBACK24-NEXT:    addl %ebx, %ebx
+; FALLBACK24-NEXT:    addl %ebp, %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %edi, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK24-NEXT:    movl %edi, %ebp
-; FALLBACK24-NEXT:    movl %eax, %ecx
-; FALLBACK24-NEXT:    shrl %cl, %ebp
-; FALLBACK24-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edi, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl 108(%esp,%ebx), %edi
+; FALLBACK24-NEXT:    movl %edi, %eax
+; FALLBACK24-NEXT:    movl %edx, %ecx
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 112(%esp,%ebx), %ecx
 ; FALLBACK24-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK24-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK24-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %ebx
-; FALLBACK24-NEXT:    orl %ebp, %ebx
-; FALLBACK24-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %esi
 ; FALLBACK24-NEXT:    addl %edi, %edi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %edi
-; FALLBACK24-NEXT:    orl %edx, %edi
-; FALLBACK24-NEXT:    movl %esi, %edx
-; FALLBACK24-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK24-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK24-NEXT:    movl %esi, %ebx
-; FALLBACK24-NEXT:    movb %al, %cl
-; FALLBACK24-NEXT:    shrl %cl, %ebx
-; FALLBACK24-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK24-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK24-NEXT:    orl %esi, %edi
+; FALLBACK24-NEXT:    movl 116(%esp,%ebx), %esi
+; FALLBACK24-NEXT:    movl %esi, %eax
+; FALLBACK24-NEXT:    movb %dl, %cl
+; FALLBACK24-NEXT:    shrl %cl, %eax
+; FALLBACK24-NEXT:    movl 120(%esp,%ebx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %ebp
-; FALLBACK24-NEXT:    orl %ebx, %ebp
+; FALLBACK24-NEXT:    orl %eax, %ebp
+; FALLBACK24-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK24-NEXT:    movl %edx, %eax
+; FALLBACK24-NEXT:    movb %al, %cl
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK24-NEXT:    shrl %cl, %ebx
+; FALLBACK24-NEXT:    shrl %cl, %edx
 ; FALLBACK24-NEXT:    addl %esi, %esi
 ; FALLBACK24-NEXT:    movb %ch, %cl
 ; FALLBACK24-NEXT:    shll %cl, %esi
-; FALLBACK24-NEXT:    orl %ebx, %esi
-; FALLBACK24-NEXT:    movb %dl, %cl
-; FALLBACK24-NEXT:    shrl %cl, %eax
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK24-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK24-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK24-NEXT:    orl %edx, %esi
+; FALLBACK24-NEXT:    movb %al, %cl
+; FALLBACK24-NEXT:    movl %ebx, %edx
+; FALLBACK24-NEXT:    shrl %cl, %edx
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK24-NEXT:    movl 124(%esp,%ebx), %ebx
+; FALLBACK24-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK24-NEXT:    movb %ch, %cl
-; FALLBACK24-NEXT:    shll %cl, %edx
-; FALLBACK24-NEXT:    orl %eax, %edx
-; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK24-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK24-NEXT:    shll %cl, %ebp
+; FALLBACK24-NEXT:    orl %edx, %ebp
+; FALLBACK24-NEXT:    movl %eax, %ecx
 ; FALLBACK24-NEXT:    sarl %cl, %ebx
 ; FALLBACK24-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK24-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK24-NEXT:    movl %edx, 56(%eax)
+; FALLBACK24-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK24-NEXT:    movl %esi, 48(%eax)
-; FALLBACK24-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK24-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK24-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK24-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK24-NEXT:    movl %ecx, 44(%eax)
@@ -23197,91 +23220,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK25-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK25-NEXT:    movl %ecx, %ebp
-; FALLBACK25-NEXT:    andl $60, %ebp
-; FALLBACK25-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK25-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shll $3, %ecx
-; FALLBACK25-NEXT:    andl $24, %ecx
-; FALLBACK25-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK25-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK25-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %esi
-; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK25-NEXT:    movl %ecx, %eax
+; FALLBACK25-NEXT:    andl $60, %eax
+; FALLBACK25-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
+; FALLBACK25-NEXT:    shll $3, %ecx
 ; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK25-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK25-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %edi
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK25-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK25-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK25-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl %esi, %edx
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK25-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edi
-; FALLBACK25-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK25-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK25-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    movl %eax, %edi
 ; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK25-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK25-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK25-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK25-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %esi
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK25-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl %edi, %esi
 ; FALLBACK25-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK25-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK25-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK25-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK25-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK25-NEXT:    movl %edx, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK25-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK25-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK25-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK25-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK25-NEXT:    movl %esi, 56(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK25-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK25-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK25-NEXT:    sarl %cl, %eax
-; FALLBACK25-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK25-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK25-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK25-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK25-NEXT:    movl %ebx, (%ebp)
-; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK25-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK25-NEXT:    sarl %cl, %edx
+; FALLBACK25-NEXT:    movl %edx, 60(%eax)
+; FALLBACK25-NEXT:    movl %edi, 48(%eax)
+; FALLBACK25-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK25-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK25-NEXT:    movl %ebp, (%eax)
+; FALLBACK25-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK25-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK25-NEXT:    addl $188, %esp
 ; FALLBACK25-NEXT:    popl %esi
 ; FALLBACK25-NEXT:    popl %edi
@@ -23331,7 +23353,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK26-NEXT:    movl %eax, %ecx
 ; FALLBACK26-NEXT:    leal (,%eax,8), %edx
-; FALLBACK26-NEXT:    andl $24, %edx
 ; FALLBACK26-NEXT:    andl $60, %ecx
 ; FALLBACK26-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK26-NEXT:    movl 72(%esp,%ecx), %edi
@@ -23339,6 +23360,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK26-NEXT:    shrxl %edx, %esi, %eax
 ; FALLBACK26-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK26-NEXT:    movl %edx, %ebx
+; FALLBACK26-NEXT:    andb $24, %bl
 ; FALLBACK26-NEXT:    notb %bl
 ; FALLBACK26-NEXT:    leal (%edi,%edi), %ebp
 ; FALLBACK26-NEXT:    shlxl %ebx, %ebp, %eax
@@ -23512,7 +23534,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK27-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK27-NEXT:    shll $3, %ecx
-; FALLBACK27-NEXT:    andl $24, %ecx
 ; FALLBACK27-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK27-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK27-NEXT:    movl 64(%esp,%ebp), %edi
@@ -23604,191 +23625,195 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK28-NEXT:    pushl %edi
 ; FALLBACK28-NEXT:    pushl %esi
 ; FALLBACK28-NEXT:    subl $204, %esp
-; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; FALLBACK28-NEXT:    vmovups (%ecx), %ymm0
-; FALLBACK28-NEXT:    vmovups 32(%ecx), %xmm1
-; FALLBACK28-NEXT:    movl 48(%ecx), %edx
-; FALLBACK28-NEXT:    movl 52(%ecx), %esi
-; FALLBACK28-NEXT:    movl 56(%ecx), %edi
-; FALLBACK28-NEXT:    movl 60(%ecx), %ecx
-; FALLBACK28-NEXT:    movl (%eax), %eax
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK28-NEXT:    vmovups (%eax), %ymm0
+; FALLBACK28-NEXT:    vmovups 32(%eax), %xmm1
+; FALLBACK28-NEXT:    movl 48(%eax), %edx
+; FALLBACK28-NEXT:    movl 52(%eax), %esi
+; FALLBACK28-NEXT:    movl 56(%eax), %edi
+; FALLBACK28-NEXT:    movl 60(%eax), %eax
+; FALLBACK28-NEXT:    movl (%ecx), %ecx
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovaps %xmm1, {{[0-9]+}}(%esp)
 ; FALLBACK28-NEXT:    vmovups %ymm0, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    sarl $31, %ecx
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; FALLBACK28-NEXT:    movl %eax, %esi
-; FALLBACK28-NEXT:    andl $60, %esi
-; FALLBACK28-NEXT:    movl 68(%esp,%esi), %edx
-; FALLBACK28-NEXT:    shll $3, %eax
-; FALLBACK28-NEXT:    andl $24, %eax
-; FALLBACK28-NEXT:    movl %edx, %edi
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    movl 72(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    sarl $31, %eax
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; FALLBACK28-NEXT:    movl %ecx, %edx
+; FALLBACK28-NEXT:    movl %ecx, %eax
+; FALLBACK28-NEXT:    andl $60, %eax
+; FALLBACK28-NEXT:    movl 68(%esp,%eax), %edi
+; FALLBACK28-NEXT:    movl %eax, %ebx
+; FALLBACK28-NEXT:    shll $3, %edx
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 72(%esp,%ebx), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
-; FALLBACK28-NEXT:    movb %al, %ch
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebp
+; FALLBACK28-NEXT:    movb %dl, %ch
+; FALLBACK28-NEXT:    andb $24, %ch
 ; FALLBACK28-NEXT:    notb %ch
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 64(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 64(%esp,%ebx), %eax
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %edi, %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %edi, %edx
+; FALLBACK28-NEXT:    shll %cl, %edi
+; FALLBACK28-NEXT:    orl %eax, %edi
+; FALLBACK28-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 76(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 80(%esp,%ebx), %edi
+; FALLBACK28-NEXT:    leal (%edi,%edi), %esi
+; FALLBACK28-NEXT:    movb %ch, %cl
+; FALLBACK28-NEXT:    shll %cl, %esi
+; FALLBACK28-NEXT:    orl %eax, %esi
+; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 76(%esp,%esi), %edx
-; FALLBACK28-NEXT:    movl %edx, %ebp
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 80(%esp,%esi), %edi
-; FALLBACK28-NEXT:    leal (%edi,%edi), %ebx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    addl %ebp, %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    addl %edx, %edx
+; FALLBACK28-NEXT:    movl 84(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %eax
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 88(%esp,%ebx), %esi
+; FALLBACK28-NEXT:    leal (%esi,%esi), %edx
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %ebx, %edx
+; FALLBACK28-NEXT:    orl %eax, %edx
 ; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 84(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movl %eax, %edx
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 88(%esp,%esi), %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    addl %eax, %eax
-; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
-; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %bl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    addl %ebp, %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 92(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 96(%esp,%esi), %edi
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; FALLBACK28-NEXT:    movl 92(%esp,%eax), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %edx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 96(%esp,%eax), %edi
 ; FALLBACK28-NEXT:    leal (%edi,%edi), %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    movb %bl, %cl
+; FALLBACK28-NEXT:    movl %ebx, %eax
+; FALLBACK28-NEXT:    shrl %cl, %esi
+; FALLBACK28-NEXT:    addl %ebp, %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %eax, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 100(%esp,%esi), %ebx
-; FALLBACK28-NEXT:    movl %ebx, %ebp
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 104(%esp,%esi), %edx
-; FALLBACK28-NEXT:    leal (%edx,%edx), %eax
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %esi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 100(%esp,%ebx), %ebp
+; FALLBACK28-NEXT:    movl %ebp, %edx
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl 104(%esp,%ebx), %esi
+; FALLBACK28-NEXT:    leal (%esi,%esi), %eax
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %eax
-; FALLBACK28-NEXT:    orl %ebp, %eax
+; FALLBACK28-NEXT:    orl %edx, %eax
 ; FALLBACK28-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK28-NEXT:    movb %dl, %cl
 ; FALLBACK28-NEXT:    shrl %cl, %edi
-; FALLBACK28-NEXT:    addl %ebx, %ebx
+; FALLBACK28-NEXT:    addl %ebp, %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %edi, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 108(%esp,%esi), %edi
-; FALLBACK28-NEXT:    movl %edi, %ebp
-; FALLBACK28-NEXT:    movl %eax, %ecx
-; FALLBACK28-NEXT:    shrl %cl, %ebp
-; FALLBACK28-NEXT:    movl 112(%esp,%esi), %ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edi, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl 108(%esp,%ebx), %edi
+; FALLBACK28-NEXT:    movl %edi, %eax
+; FALLBACK28-NEXT:    movl %edx, %ecx
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 112(%esp,%ebx), %ecx
 ; FALLBACK28-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebx
+; FALLBACK28-NEXT:    leal (%ecx,%ecx), %ebp
 ; FALLBACK28-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %ebx
-; FALLBACK28-NEXT:    orl %ebp, %ebx
-; FALLBACK28-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %esi
 ; FALLBACK28-NEXT:    addl %edi, %edi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %edi
-; FALLBACK28-NEXT:    orl %edx, %edi
-; FALLBACK28-NEXT:    movl %esi, %edx
-; FALLBACK28-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK28-NEXT:    movl 116(%esp,%esi), %esi
-; FALLBACK28-NEXT:    movl %esi, %ebx
-; FALLBACK28-NEXT:    movb %al, %cl
-; FALLBACK28-NEXT:    shrl %cl, %ebx
-; FALLBACK28-NEXT:    movl 120(%esp,%edx), %eax
-; FALLBACK28-NEXT:    leal (%eax,%eax), %ebp
+; FALLBACK28-NEXT:    orl %esi, %edi
+; FALLBACK28-NEXT:    movl 116(%esp,%ebx), %esi
+; FALLBACK28-NEXT:    movl %esi, %eax
+; FALLBACK28-NEXT:    movb %dl, %cl
+; FALLBACK28-NEXT:    shrl %cl, %eax
+; FALLBACK28-NEXT:    movl 120(%esp,%ebx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %ebp
-; FALLBACK28-NEXT:    orl %ebx, %ebp
+; FALLBACK28-NEXT:    orl %eax, %ebp
+; FALLBACK28-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK28-NEXT:    movl %edx, %eax
+; FALLBACK28-NEXT:    movb %al, %cl
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; FALLBACK28-NEXT:    shrl %cl, %ebx
+; FALLBACK28-NEXT:    shrl %cl, %edx
 ; FALLBACK28-NEXT:    addl %esi, %esi
 ; FALLBACK28-NEXT:    movb %ch, %cl
 ; FALLBACK28-NEXT:    shll %cl, %esi
-; FALLBACK28-NEXT:    orl %ebx, %esi
-; FALLBACK28-NEXT:    movb %dl, %cl
-; FALLBACK28-NEXT:    shrl %cl, %eax
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; FALLBACK28-NEXT:    movl 124(%esp,%edx), %ebx
-; FALLBACK28-NEXT:    leal (%ebx,%ebx), %edx
+; FALLBACK28-NEXT:    orl %edx, %esi
+; FALLBACK28-NEXT:    movb %al, %cl
+; FALLBACK28-NEXT:    movl %ebx, %edx
+; FALLBACK28-NEXT:    shrl %cl, %edx
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; FALLBACK28-NEXT:    movl 124(%esp,%ebx), %ebx
+; FALLBACK28-NEXT:    leal (%ebx,%ebx), %ebp
 ; FALLBACK28-NEXT:    movb %ch, %cl
-; FALLBACK28-NEXT:    shll %cl, %edx
-; FALLBACK28-NEXT:    orl %eax, %edx
-; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; FALLBACK28-NEXT:    # kill: def $cl killed $cl killed $ecx
+; FALLBACK28-NEXT:    shll %cl, %ebp
+; FALLBACK28-NEXT:    orl %edx, %ebp
+; FALLBACK28-NEXT:    movl %eax, %ecx
 ; FALLBACK28-NEXT:    sarl %cl, %ebx
 ; FALLBACK28-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; FALLBACK28-NEXT:    movl %ebx, 60(%eax)
-; FALLBACK28-NEXT:    movl %edx, 56(%eax)
+; FALLBACK28-NEXT:    movl %ebp, 56(%eax)
 ; FALLBACK28-NEXT:    movl %esi, 48(%eax)
-; FALLBACK28-NEXT:    movl %ebp, 52(%eax)
+; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK28-NEXT:    movl %ecx, 52(%eax)
 ; FALLBACK28-NEXT:    movl %edi, 40(%eax)
 ; FALLBACK28-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; FALLBACK28-NEXT:    movl %ecx, 44(%eax)
@@ -23859,91 +23884,90 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; FALLBACK29-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; FALLBACK29-NEXT:    movl %ecx, %ebp
-; FALLBACK29-NEXT:    andl $60, %ebp
-; FALLBACK29-NEXT:    movl 56(%esp,%ebp), %edx
-; FALLBACK29-NEXT:    movl 52(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shll $3, %ecx
-; FALLBACK29-NEXT:    andl $24, %ecx
-; FALLBACK29-NEXT:    shrdl %cl, %edx, %eax
-; FALLBACK29-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 64(%esp,%ebp), %edi
-; FALLBACK29-NEXT:    movl 60(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %esi
-; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
-; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
+; FALLBACK29-NEXT:    movl %ecx, %eax
+; FALLBACK29-NEXT:    andl $60, %eax
+; FALLBACK29-NEXT:    movl 56(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl 52(%esp,%eax), %edx
 ; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 72(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 68(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
+; FALLBACK29-NEXT:    shll $3, %ecx
 ; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
 ; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
+; FALLBACK29-NEXT:    movl 64(%esp,%eax), %ebx
+; FALLBACK29-NEXT:    movl 60(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %edi
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %edi
 ; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 80(%esp,%ebp), %edi
-; FALLBACK29-NEXT:    movl 76(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %edi, %edx
-; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 72(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 68(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 80(%esp,%eax), %ebx
+; FALLBACK29-NEXT:    movl 76(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %ebx, %esi
 ; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 88(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 84(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %esi, %edx
-; FALLBACK29-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl %esi, %edx
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edi
-; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 96(%esp,%ebp), %esi
-; FALLBACK29-NEXT:    movl 92(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edi
-; FALLBACK29-NEXT:    shrdl %cl, %esi, %edi
-; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; FALLBACK29-NEXT:    movl 104(%esp,%ebp), %edx
-; FALLBACK29-NEXT:    movl 100(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    movl %eax, %edi
 ; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %esi
-; FALLBACK29-NEXT:    movl 48(%esp,%ebp), %ebx
-; FALLBACK29-NEXT:    movl 108(%esp,%ebp), %eax
-; FALLBACK29-NEXT:    shrdl %cl, %eax, %edx
-; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; FALLBACK29-NEXT:    movl %edx, 56(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 88(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 84(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %esi
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %esi
+; FALLBACK29-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl %edi, %esi
 ; FALLBACK29-NEXT:    shrdl %cl, %edx, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 96(%esp,%eax), %edi
+; FALLBACK29-NEXT:    movl 92(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %edi, %ebx
+; FALLBACK29-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; FALLBACK29-NEXT:    movl 104(%esp,%eax), %esi
+; FALLBACK29-NEXT:    movl 100(%esp,%eax), %edx
+; FALLBACK29-NEXT:    movl %edx, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %ebx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %edi
+; FALLBACK29-NEXT:    movl 48(%esp,%eax), %ebp
+; FALLBACK29-NEXT:    movl 108(%esp,%eax), %edx
+; FALLBACK29-NEXT:    shrdl %cl, %edx, %esi
+; FALLBACK29-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; FALLBACK29-NEXT:    movl %esi, 56(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; FALLBACK29-NEXT:    shrdl %cl, %esi, %ebp
 ; FALLBACK29-NEXT:    # kill: def $cl killed $cl killed $ecx
-; FALLBACK29-NEXT:    sarl %cl, %eax
-; FALLBACK29-NEXT:    movl %eax, 60(%ebp)
-; FALLBACK29-NEXT:    movl %esi, 48(%ebp)
-; FALLBACK29-NEXT:    movl %edi, 52(%ebp)
-; FALLBACK29-NEXT:    movl (%esp), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 40(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 44(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 32(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 36(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 24(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 28(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 16(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 20(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 8(%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 12(%ebp)
-; FALLBACK29-NEXT:    movl %ebx, (%ebp)
-; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; FALLBACK29-NEXT:    movl %eax, 4(%ebp)
+; FALLBACK29-NEXT:    sarl %cl, %edx
+; FALLBACK29-NEXT:    movl %edx, 60(%eax)
+; FALLBACK29-NEXT:    movl %edi, 48(%eax)
+; FALLBACK29-NEXT:    movl %ebx, 52(%eax)
+; FALLBACK29-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 40(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 44(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 32(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 36(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 24(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 28(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 16(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 20(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 8(%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 12(%eax)
+; FALLBACK29-NEXT:    movl %ebp, (%eax)
+; FALLBACK29-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; FALLBACK29-NEXT:    movl %ecx, 4(%eax)
 ; FALLBACK29-NEXT:    addl $188, %esp
 ; FALLBACK29-NEXT:    popl %esi
 ; FALLBACK29-NEXT:    popl %edi
@@ -23993,7 +24017,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; FALLBACK30-NEXT:    movl %eax, %ecx
 ; FALLBACK30-NEXT:    leal (,%eax,8), %edx
-; FALLBACK30-NEXT:    andl $24, %edx
 ; FALLBACK30-NEXT:    andl $60, %ecx
 ; FALLBACK30-NEXT:    movl 68(%esp,%ecx), %esi
 ; FALLBACK30-NEXT:    movl 72(%esp,%ecx), %edi
@@ -24001,6 +24024,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK30-NEXT:    shrxl %edx, %esi, %eax
 ; FALLBACK30-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK30-NEXT:    movl %edx, %ebx
+; FALLBACK30-NEXT:    andb $24, %bl
 ; FALLBACK30-NEXT:    notb %bl
 ; FALLBACK30-NEXT:    leal (%edi,%edi), %ebp
 ; FALLBACK30-NEXT:    shlxl %ebx, %ebp, %eax
@@ -24174,7 +24198,6 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %byteOff.ptr, ptr %dst) nounwind {
 ; FALLBACK31-NEXT:    movl 52(%esp,%ebp), %eax
 ; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK31-NEXT:    shll $3, %ecx
-; FALLBACK31-NEXT:    andl $24, %ecx
 ; FALLBACK31-NEXT:    shrdl %cl, %edx, %eax
 ; FALLBACK31-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; FALLBACK31-NEXT:    movl 64(%esp,%ebp), %edi
diff --git a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
index 8c0873492ce402..372de784b5133d 100644
--- a/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
+++ b/llvm/test/CodeGen/X86/wide-scalar-shift-legalization.ll
@@ -3092,113 +3092,107 @@ define void @ashr_32bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: lshr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: lshr_64bytes:
@@ -3206,7 +3200,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
@@ -3214,7 +3208,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -3227,39 +3221,38 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%r8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%r8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%r8), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%r8), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%r8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 56(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
@@ -3301,53 +3294,51 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
@@ -3362,7 +3353,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
@@ -3370,7 +3361,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
@@ -3383,38 +3374,37 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%r8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
@@ -3427,35 +3417,35 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
@@ -3463,7 +3453,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3483,174 +3473,166 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%edi), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edi), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%esi), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -3689,68 +3671,67 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
@@ -3849,8 +3830,7 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
@@ -3872,156 +3852,153 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ecx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ecx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -4058,149 +4035,150 @@ define void @lshr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $188, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 60(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 48(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 52(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
@@ -4221,40 +4199,40 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %esi, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movslq %ecx, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rbx), %r8
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rbx), %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %r9
@@ -4331,33 +4309,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movslq %eax, %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r9), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r9), %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, %rsi
@@ -4399,51 +4376,51 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    pushq %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl (%rsi), %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %esi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movslq %ecx, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -64(%rsp,%rsi), %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -56(%rsp,%rsi), %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -40(%rsp,%rsi), %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -48(%rsp,%rsi), %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r14, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -24(%rsp,%rsi), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %r15
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r10, %r12
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %r13d
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $63, %r13b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r13b
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r9, %rbp
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -32(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r14, %r14
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
@@ -4453,23 +4430,23 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rcx, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rdi, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %rdi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrq %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %r13, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r12, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 40(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r14, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
@@ -4488,33 +4465,32 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 32(%rdi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %esi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %esi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movslq %eax, %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -48(%rsp,%r8), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -40(%rsp,%r8), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rsi
@@ -4574,14 +4550,15 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -4589,7 +4566,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -4613,197 +4591,198 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl %ecx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%edi), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %dl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    negl %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 176(%esp,%eax), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 52(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 40(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 44(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 32(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 36(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 56(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 16(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 20(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 8(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 12(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 4(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $204, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %edi
@@ -4885,7 +4864,6 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    subl %ebp, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
@@ -4977,46 +4955,48 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%ebp), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%ebp), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%ebp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5037,125 +5017,127 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %ebp, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    subl %eax, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esi), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $31, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%edi), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %edx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%ebp), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 12(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 16(%edi), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %edx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, (%esp), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 32(%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%edi), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%ebp), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%ebp), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %edx, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%edi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%edi), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%esi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%esi), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, 188(%esp,%ecx), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    negl %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, 188(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%edi), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %edi, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ebx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, (%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 48(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -5166,7 +5148,7 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 32(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 36(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 24(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 28(%eax)
@@ -5194,42 +5176,42 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    pushl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl $204, %esp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ebx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%ecx), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ebx), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ebx), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ebx), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ebx), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ebx), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ebx), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%ecx), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%ecx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%ecx), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%ecx), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%ecx), %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%ecx), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    xorps %xmm0, %xmm0
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
@@ -5254,48 +5236,47 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    leal {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebx, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    subl %ebp, %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 4(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 8(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 12(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 16(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 20(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 24(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 28(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 32(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 36(%eax), %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edi, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 40(%eax), %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 44(%eax), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %ebx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
@@ -5303,8 +5284,8 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %esi, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 176(%esp,%ebx), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    negl %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 176(%esp,%ebp), %ebp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 60(%eax)
@@ -5313,13 +5294,13 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %ebp, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 48(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shldl %cl, %edx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 48(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -5356,117 +5337,111 @@ define void @shl_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-NO-SHLD-LABEL: ashr_64bytes:
 ; X64-NO-BMI2-NO-SHLD:       # %bb.0:
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    pushq %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq 8(%rdi), %rcx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 16(%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 24(%rdi), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 32(%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 40(%rdi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 48(%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq 56(%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl (%rsi), %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    sarq $63, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %r8d
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rsi, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%r8), %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rdi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r14,%r14), %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r11
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%r8), %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rbp,%rbp), %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r12
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbx, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rdi), %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r12
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rdi), %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r13,%r13), %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r15, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r15
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r12, %r15
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rbp
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%r8), %rdi
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rbx, %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rbp, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r14, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r13
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rdi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%rdi,%rdi), %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r13, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    sarq %cl, %rdi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rdi, 56(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, 48(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 32(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r12, 40(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 16(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 48(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 32(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r15, 40(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r11, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r12
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r13
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %r15
-; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rbp
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
 ; X64-NO-BMI2-HAVE-SHLD-LABEL: ashr_64bytes:
@@ -5474,7 +5449,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
@@ -5482,7 +5457,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
@@ -5490,7 +5465,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
@@ -5500,38 +5475,37 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %r8d
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%r8), %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%r8), %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%r8), %r10
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%r8), %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%r8), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %rax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    sarq %cl, %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 56(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %r15
@@ -5577,53 +5551,51 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %eax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r13
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r11, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rax), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rdi, %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rax), %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rax), %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r14, %r15
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r10, %rbp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %r12d
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %rbp
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %sil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    xorb $63, %r12b
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r9, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %r9
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbx, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rbx, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %r15
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r15, %r13
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -80(%rsp,%rax), %rbx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r13
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    sarxq %rcx, %rax, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r9, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r15,%r15), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r10, %r10
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r10, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r11, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %r11, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r15, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r14, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %r14, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rbp, %r11
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rsi, %rax, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %r12, %rax, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r13, %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 48(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r11, 32(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 40(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rbx, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 40(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 16(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r10, 24(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %r12
@@ -5638,7 +5610,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    pushq %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rcx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq (%rdi), %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 8(%rdi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 16(%rdi), %r9
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 24(%rdi), %r10
@@ -5646,7 +5618,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 40(%rdi), %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 48(%rdi), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq 56(%rdi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %eax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%rsi), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, -{{[0-9]+}}(%rsp)
@@ -5654,7 +5626,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, -{{[0-9]+}}(%rsp)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarq $63, %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
@@ -5664,37 +5636,36 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rax), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rax), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rax), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rax), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rax), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %r8d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%r8), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%r8), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%r8), %r9
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r9, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%r8), %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%r8), %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r10, %rbx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%rax), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%rax), %r14
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -80(%rsp,%r8), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -88(%rsp,%r8), %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r14, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r11, %r15
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r14, %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%rax), %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %rax, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -72(%rsp,%r8), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxq %rcx, %r8, %rcx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 48(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 56(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rcx, 56(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 32(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r15, 40(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r14
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %r15
@@ -5711,7 +5682,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%eax), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, (%esp) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 12(%eax), %ecx
@@ -5758,7 +5729,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -5779,171 +5750,159 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %cl, (%esp) # 1-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebp), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl $3, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    xorb $31, %ch
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebp), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edx,%edx), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, {{[-0-9]+}}(%e{{[sb]}}p) # 1-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%eax), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%eax), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebp), %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%eax), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb {{[-0-9]+}}(%e{{[sb]}}p), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl (%esp), %ecx # 1-byte Folded Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ecx,%ecx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb (%esp), %ch # 1-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebp), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebp), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %al, %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %ch, %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    sarl %cl, %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 60(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 60(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 48(%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 52(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 40(%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
@@ -6007,9 +5966,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
@@ -6025,7 +5984,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -6036,25 +5995,24 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
 ; X86-NO-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %esi
@@ -6173,42 +6131,40 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 36(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 40(%eax), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 44(%eax), %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 56(%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 60(%eax), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl (%eax), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarl $31, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
@@ -6226,124 +6182,115 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notl %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %al
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %eax, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $31, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrl $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 68(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 72(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    xorb $31, %bl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, 64(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 80(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 76(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 88(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ebx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 84(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 96(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 92(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 104(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ebx), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 100(%esp,%ecx), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ebx), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 112(%esp,%ecx), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edi, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ebx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 108(%esp,%ecx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %esi, %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Folded Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ebp, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %esi, %edi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%ebx), %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Folded Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebx), %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %eax, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %ecx, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 120(%esp,%edi), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ecx, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 116(%esp,%edi), %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %eax, %ebx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %eax, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl {{[-0-9]+}}(%e{{[sb]}}p), %edi # 4-byte Folded Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %ebp, %eax
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ebp # 4-byte Reload
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 124(%esp,%ebp), %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    sarxl %edx, %ebp, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ebp, %ebp
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %ebp, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %eax, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edx, 60(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebx, 56(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ebp, 52(%eax)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 40(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 52(%eax)
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, 40(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, 44(%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
@@ -6406,9 +6353,9 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%eax), %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%eax), %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%eax), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%ecx), %ecx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[0-9]+}}(%esp)
@@ -6424,7 +6371,7 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
@@ -6435,106 +6382,107 @@ define void @ashr_64bytes(ptr %src.ptr, ptr %bitOff.ptr, ptr %dst) nounwind {
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, {{[0-9]+}}(%esp)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $31, %ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarl $31, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[0-9]+}}(%esp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrl $3, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $60, %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 56(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 52(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 64(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 60(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 72(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 68(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 80(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 76(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, (%esp) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%ebp), %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %edi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 88(%esp,%eax), %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 84(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%esp) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 96(%esp,%eax), %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 92(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %ebp
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%eax), %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %ebx, %edi
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%eax), %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%eax), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 56(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, 48(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 52(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebp, 40(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 44(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 32(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 36(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 24(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 28(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 16(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 20(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 8(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 12(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Folded Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 104(%esp,%ebp), %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 100(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edi, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %esi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 48(%esp,%ebp), %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl 108(%esp,%ebp), %edi
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ebp
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 56(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    sarxl %ecx, %edi, %eax
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 60(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %esi, 48(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 52(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 40(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 44(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 32(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 36(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 24(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 28(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 16(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 20(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 8(%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 12(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, 4(%eax)
-; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %edx, 60(%eax)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdl %cl, %eax, %ebx
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ebx, (%ebp)
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, 4(%ebp)
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    addl $188, %esp
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %esi
 ; X86-HAVE-BMI2-HAVE-SHLD-NEXT:    popl %edi
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
index 044be12a395433..65885750f8c963 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca-with-zero-upper-half.ll
@@ -434,8 +434,9 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -448,18 +449,19 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca_with_zero_upper_half:
@@ -505,12 +507,12 @@ define void @load_1byte_chunk_of_16byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -966,18 +968,25 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    xorps %xmm1, %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movl %ecx, %eax
-; X64-NO-BMI2-NEXT:    shrb $6, %al
-; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rsi,8), %edi
+; X64-NO-BMI2-NEXT:    addl %edi, %edi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orb %sil, %dil
+; X64-NO-BMI2-NEXT:    movb %dil, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -992,14 +1001,22 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orb %al, %cl
+; X64-BMI2-NEXT:    movb %cl, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1012,18 +1029,19 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca_with_zero_upper_half:
@@ -1069,12 +1087,12 @@ define void @load_1byte_chunk_of_32byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -1891,18 +1909,20 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
-; X64-NO-BMI2-NEXT:    andl $56, %ecx
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %eax
+; X64-NO-BMI2-NEXT:    # kill: def $esi killed $esi killed $rsi def $rsi
 ; X64-NO-BMI2-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rax
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
-; X64-NO-BMI2-NEXT:    addl %esi, %esi
-; X64-NO-BMI2-NEXT:    notl %ecx
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NEXT:    orl %eax, %esi
-; X64-NO-BMI2-NEXT:    movb %sil, (%rdx)
+; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %edi
+; X64-NO-BMI2-NEXT:    addl %edi, %edi
+; X64-NO-BMI2-NEXT:    andl $56, %eax
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NEXT:    movq -128(%rsp,%rsi), %rsi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orb %sil, %dil
+; X64-NO-BMI2-NEXT:    movb %dil, (%rdx)
 ; X64-NO-BMI2-NEXT:    popq %rax
 ; X64-NO-BMI2-NEXT:    retq
 ;
@@ -1924,19 +1944,19 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
 ; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-BMI2-NEXT:    notb %al
 ; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
 ; X64-BMI2-NEXT:    addl %esi, %esi
 ; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
-; X64-BMI2-NEXT:    orl %eax, %ecx
+; X64-BMI2-NEXT:    orb %al, %cl
 ; X64-BMI2-NEXT:    movb %cl, (%rdx)
 ; X64-BMI2-NEXT:    popq %rax
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
-; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $136, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
@@ -1954,17 +1974,17 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edx,8), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %bl, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $136, %esp
-; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_64byte_alloca_with_zero_upper_half:
@@ -2014,14 +2034,15 @@ define void @load_1byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %edx, %ecx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shll $3, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $136, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -2062,7 +2083,7 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
 ; X64-NO-BMI2-NEXT:    addl %esi, %esi
-; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    notb %cl
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NEXT:    orl %eax, %esi
@@ -2088,8 +2109,8 @@ define void @load_2byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
 ; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-BMI2-NEXT:    notb %al
 ; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
 ; X64-BMI2-NEXT:    addl %esi, %esi
 ; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
@@ -2225,7 +2246,7 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-NO-BMI2-NEXT:    shrq %cl, %rax
 ; X64-NO-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
 ; X64-NO-BMI2-NEXT:    addl %esi, %esi
-; X64-NO-BMI2-NEXT:    notl %ecx
+; X64-NO-BMI2-NEXT:    notb %cl
 ; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X64-NO-BMI2-NEXT:    shlq %cl, %rsi
 ; X64-NO-BMI2-NEXT:    orl %eax, %esi
@@ -2251,8 +2272,8 @@ define void @load_4byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X64-BMI2-NEXT:    andl $56, %eax
 ; X64-BMI2-NEXT:    andl $56, %esi
 ; X64-BMI2-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
-; X64-BMI2-NEXT:    # kill: def $eax killed $eax killed $rax def $rax
-; X64-BMI2-NEXT:    notl %eax
+; X64-BMI2-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-BMI2-NEXT:    notb %al
 ; X64-BMI2-NEXT:    movl -120(%rsp,%rsi), %esi
 ; X64-BMI2-NEXT:    addl %esi, %esi
 ; X64-BMI2-NEXT:    shlxq %rax, %rsi, %rax
@@ -2452,10 +2473,10 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $140, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2465,27 +2486,26 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%ebx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ebx), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%edi,8), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edi), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ebx), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, (%ecx)
@@ -2502,11 +2522,11 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    subl $128, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    movups (%edx), %xmm0
-; X86-SHLD-NEXT:    movups 16(%edx), %xmm1
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2516,19 +2536,17 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, (%esp)
-; X86-SHLD-NEXT:    movl %ecx, %esi
-; X86-SHLD-NEXT:    andl $60, %esi
-; X86-SHLD-NEXT:    movl 8(%esp,%esi), %edi
-; X86-SHLD-NEXT:    movl (%esp,%esi), %edx
-; X86-SHLD-NEXT:    movl 4(%esp,%esi), %esi
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    movl %esi, %ebx
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    leal (,%edx,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %edx
+; X86-SHLD-NEXT:    movl 8(%esp,%edx), %esi
+; X86-SHLD-NEXT:    movl (%esp,%edx), %edi
+; X86-SHLD-NEXT:    movl 4(%esp,%edx), %edx
+; X86-SHLD-NEXT:    movl %edx, %ebx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %edi
 ; X86-SHLD-NEXT:    movl %ebx, 4(%eax)
-; X86-SHLD-NEXT:    movl %edx, (%eax)
+; X86-SHLD-NEXT:    movl %edi, (%eax)
 ; X86-SHLD-NEXT:    addl $128, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
@@ -2556,12 +2574,12 @@ define void @load_8byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i6
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%ecx,8), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, (%esp,%ecx), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%ecx), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %edx, %edi, %ebx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $dl killed $dl killed $edx def $edx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 8(%esp,%ecx), %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
@@ -2606,27 +2624,26 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %rdi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $56, %cl
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %rdi, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %al
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %rsi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 8(%rdx)
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r10, (%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    popq %rax
 ; X64-NO-BMI2-NO-SHLD-NEXT:    retq
 ;
@@ -2645,23 +2662,20 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r9
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %cl
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r10, %rsi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r8
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r8, (%rdx)
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    popq %rax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    retq
@@ -2681,24 +2695,23 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $56, %dil
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %dil
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r8,%r8), %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rdi, %r9, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rax, %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r9, 8(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, (%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rax
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    retq
 ;
@@ -2717,21 +2730,20 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rdi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rdi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rsi, %r8
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rax, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rdi, %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r9d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r9b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r9, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r8, %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rsi, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r8, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rdi, %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, (%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rax
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -2742,10 +2754,10 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $156, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2755,37 +2767,38 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%esi,8), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 16(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%esi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%esi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
@@ -2795,8 +2808,7 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 12(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 8(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 4(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, 4(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, (%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl $156, %esp
@@ -2813,10 +2825,10 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -2826,27 +2838,25 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %esi
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %eax
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %ebx
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 24(%esp,%eax), %edi
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl 20(%esp,%eax), %ebx
 ; X86-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebx
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %edi
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
-; X86-SHLD-NEXT:    movl %ebp, 12(%edx)
-; X86-SHLD-NEXT:    movl %esi, 8(%edx)
-; X86-SHLD-NEXT:    movl %ebx, 4(%edx)
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebx
+; X86-SHLD-NEXT:    movl 28(%esp,%eax), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edi
+; X86-SHLD-NEXT:    movl 32(%esp,%eax), %eax
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebp
+; X86-SHLD-NEXT:    movl %ebp, 12(%esi)
+; X86-SHLD-NEXT:    movl %edi, 8(%esi)
+; X86-SHLD-NEXT:    movl %ebx, 4(%esi)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %esi # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %eax
-; X86-SHLD-NEXT:    movl %eax, (%edx)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
+; X86-SHLD-NEXT:    movl %edx, (%esi)
 ; X86-SHLD-NEXT:    addl $156, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
@@ -2875,13 +2885,13 @@ define void @load_16byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edx, %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
@@ -2942,43 +2952,42 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %edi
 ; X64-NO-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r8
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-NO-SHLD-NEXT:    andb $56, %dil
+; X64-NO-BMI2-NO-SHLD-NEXT:    notb %dil
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r9
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r8, %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %r8d
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %r8d
 ; X64-NO-BMI2-NO-SHLD-NEXT:    notb %r8b
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r9
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %rbx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r9
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    notl %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rbx
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rbx
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    leaq (%r11,%r11), %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
 ; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %r14
-; X64-NO-BMI2-NO-SHLD-NEXT:    movl %edi, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r11
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %r14
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r11, %rsi
-; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-NO-BMI2-NO-SHLD-NEXT:    shrq %cl, %r10
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    addq %rax, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movl %r8d, %ecx
+; X64-NO-BMI2-NO-SHLD-NEXT:    shlq %cl, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    orq %r10, %rax
+; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r14, 16(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %rbx, 8(%rdx)
 ; X64-NO-BMI2-NO-SHLD-NEXT:    movq %r9, (%rdx)
@@ -3003,17 +3012,15 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %eax
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %eax
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %r8
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r9
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq %r9, %r10
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    shrq %cl, %r10
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    notl %edi
-; X64-NO-BMI2-HAVE-SHLD-NEXT:    andl $63, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %eax, %edi
+; X64-NO-BMI2-HAVE-SHLD-NEXT:    notb %dil
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
 ; X64-NO-BMI2-HAVE-SHLD-NEXT:    movl %edi, %ecx
@@ -3057,36 +3064,35 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %ecx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, -128(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, -128(%rsp,%rsi), %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r8, %r10
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %r9, %r11
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %rbx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rcx, %rbx, %r14
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx def $rcx
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -112(%rsp,%rsi), %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %rdi, %r9
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -104(%rsp,%rsi), %r10
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r8, %r11
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shrxq %rax, %r10, %r14
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    # kill: def $al killed $al killed $rax def $rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    andb $56, %al
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %al
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r8, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rdi, %r8
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    notl %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r9,%r9), %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rdi, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r10, %rdi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%rbx,%rbx), %r9
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rcx, %r9, %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %r8, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %rcx, %r8
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    leaq (%r10,%r10), %rcx
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rcx, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r9, %rax
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rdi, %rdi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %rdi, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r11, %rcx
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rax
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 16(%rdx)
-; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rdi, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    shlxq %rbx, %rsi, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    orq %r14, %rsi
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rcx, 8(%rdx)
+; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %rax, 16(%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    movq %r8, (%rdx)
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    addq $8, %rsp
 ; X64-HAVE-BMI2-NO-SHLD-NEXT:    popq %rbx
@@ -3107,32 +3113,30 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %eax, %ecx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %ecx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leal (,%rsi,8), %ecx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $56, %esi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %r8
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -120(%rsp,%rsi), %rax
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r8
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r8, %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notl %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    andl $63, %eax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r10
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r10,%r10), %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %r11, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %r11
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -104(%rsp,%rsi), %r9
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrxq %rcx, %r9, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movl %ecx, %r10d
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    notb %r10b
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -112(%rsp,%rsi), %r11
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    leaq (%r11,%r11), %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rbx, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rdi, %rbx
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -128(%rsp,%rsi), %rdi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq -96(%rsp,%rsi), %rsi
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    addq %rsi, %rsi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %rax, %rsi, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %rbx, %rax
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r9, %r10
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shlxq %r10, %rsi, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    orq %r9, %rsi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %r11
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    # kill: def $cl killed $cl killed $rcx
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %r8, %rdi
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r10, 16(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    shrdq %cl, %rax, %rdi
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 16(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rdi, (%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rax, 24(%rdx)
-; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %r11, 8(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rsi, 24(%rdx)
+; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    movq %rbx, 8(%rdx)
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    popq %rbx
 ; X64-HAVE-BMI2-HAVE-SHLD-NEXT:    retq
 ;
@@ -3143,10 +3147,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    subl $172, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %esi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%ecx), %xmm0
-; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%ecx), %xmm1
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups (%eax), %xmm0
+; X86-NO-BMI2-NO-SHLD-NEXT:    movups 16(%eax), %xmm1
 ; X86-NO-BMI2-NO-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -3156,80 +3160,83 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %edi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll $3, %eax
-; X86-NO-BMI2-NO-SHLD-NEXT:    andl $24, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (,%esi,8), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    andl $60, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 32(%esp,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 36(%esp,%esi), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    andb $24, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 40(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 44(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%edi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 48(%esp,%esi), %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%edi), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 52(%esp,%esi), %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%edi,%edi), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%edi), %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 56(%esp,%esi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%eax,%eax), %ebp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebp
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%edi), %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %edi, %ebp
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebx, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 60(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    leal (%ebx,%ebx), %edi
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebp, %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, %ecx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    orl %eax, %edi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %ecx # 4-byte Reload
+; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %ebx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%edi), %eax
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 64(%esp,%esi), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    addl %eax, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edx, %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    orl %ebx, %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 28(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %esi, 24(%ecx)
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 20(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %edi, 24(%ecx)
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ebp, 20(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %eax, 16(%ecx)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
@@ -3254,10 +3261,10 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    pushl %edi
 ; X86-SHLD-NEXT:    pushl %esi
 ; X86-SHLD-NEXT:    subl $156, %esp
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
-; X86-SHLD-NEXT:    movups (%eax), %xmm0
-; X86-SHLD-NEXT:    movups 16(%eax), %xmm1
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
+; X86-SHLD-NEXT:    movups (%ecx), %xmm0
+; X86-SHLD-NEXT:    movups 16(%ecx), %xmm1
 ; X86-SHLD-NEXT:    xorps %xmm2, %xmm2
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
@@ -3267,47 +3274,45 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-SHLD-NEXT:    movaps %xmm2, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
-; X86-SHLD-NEXT:    movl %ecx, %edi
-; X86-SHLD-NEXT:    andl $60, %edi
-; X86-SHLD-NEXT:    movl 24(%esp,%edi), %edx
-; X86-SHLD-NEXT:    movl 20(%esp,%edi), %eax
-; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    shll $3, %ecx
-; X86-SHLD-NEXT:    andl $24, %ecx
-; X86-SHLD-NEXT:    movl %eax, %esi
-; X86-SHLD-NEXT:    movl %edx, %eax
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 28(%esp,%edi), %edx
-; X86-SHLD-NEXT:    shrdl %cl, %edx, %eax
-; X86-SHLD-NEXT:    movl %eax, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 32(%esp,%edi), %ebp
-; X86-SHLD-NEXT:    shrdl %cl, %ebp, %edx
-; X86-SHLD-NEXT:    movl %edx, (%esp) # 4-byte Spill
-; X86-SHLD-NEXT:    movl 36(%esp,%edi), %esi
-; X86-SHLD-NEXT:    shrdl %cl, %esi, %ebp
-; X86-SHLD-NEXT:    movl 40(%esp,%edi), %edx
+; X86-SHLD-NEXT:    leal (,%eax,8), %ecx
+; X86-SHLD-NEXT:    andl $60, %eax
+; X86-SHLD-NEXT:    movl 24(%esp,%eax), %esi
+; X86-SHLD-NEXT:    movl 20(%esp,%eax), %edx
+; X86-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl %edx, %edi
+; X86-SHLD-NEXT:    movl %esi, %edx
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 28(%esp,%eax), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edx
+; X86-SHLD-NEXT:    movl %edx, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 32(%esp,%eax), %ebp
+; X86-SHLD-NEXT:    shrdl %cl, %ebp, %esi
+; X86-SHLD-NEXT:    movl %esi, (%esp) # 4-byte Spill
+; X86-SHLD-NEXT:    movl 36(%esp,%eax), %edi
+; X86-SHLD-NEXT:    shrdl %cl, %edi, %ebp
+; X86-SHLD-NEXT:    movl 40(%esp,%eax), %esi
+; X86-SHLD-NEXT:    shrdl %cl, %esi, %edi
+; X86-SHLD-NEXT:    movl 44(%esp,%eax), %edx
 ; X86-SHLD-NEXT:    shrdl %cl, %edx, %esi
-; X86-SHLD-NEXT:    movl 44(%esp,%edi), %eax
+; X86-SHLD-NEXT:    movl 16(%esp,%eax), %ebx
+; X86-SHLD-NEXT:    movl 48(%esp,%eax), %eax
 ; X86-SHLD-NEXT:    shrdl %cl, %eax, %edx
-; X86-SHLD-NEXT:    movl 16(%esp,%edi), %ebx
-; X86-SHLD-NEXT:    movl 48(%esp,%edi), %edi
-; X86-SHLD-NEXT:    shrdl %cl, %edi, %eax
-; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edi
-; X86-SHLD-NEXT:    movl %eax, 28(%edi)
-; X86-SHLD-NEXT:    movl %edx, 24(%edi)
-; X86-SHLD-NEXT:    movl %esi, 20(%edi)
-; X86-SHLD-NEXT:    movl %ebp, 16(%edi)
-; X86-SHLD-NEXT:    movl (%esp), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 12(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 8(%edi)
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    movl %eax, 4(%edi)
+; X86-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
+; X86-SHLD-NEXT:    movl %edx, 28(%eax)
+; X86-SHLD-NEXT:    movl %esi, 24(%eax)
+; X86-SHLD-NEXT:    movl %edi, 20(%eax)
+; X86-SHLD-NEXT:    movl %ebp, 16(%eax)
+; X86-SHLD-NEXT:    movl (%esp), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 12(%eax)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 8(%eax)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    movl %edx, 4(%eax)
 ; X86-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %eax # 4-byte Reload
-; X86-SHLD-NEXT:    shrdl %cl, %eax, %ebx
-; X86-SHLD-NEXT:    movl %ebx, (%edi)
+; X86-SHLD-NEXT:    movl {{[-0-9]+}}(%e{{[sb]}}p), %edx # 4-byte Reload
+; X86-SHLD-NEXT:    shrdl %cl, %edx, %ebx
+; X86-SHLD-NEXT:    movl %ebx, (%eax)
 ; X86-SHLD-NEXT:    addl $156, %esp
 ; X86-SHLD-NEXT:    popl %esi
 ; X86-SHLD-NEXT:    popl %edi
@@ -3336,23 +3341,23 @@ define void @load_32byte_chunk_of_64byte_alloca_with_zero_upper_half(ptr %src, i
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm1, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movaps %xmm0, {{[0-9]+}}(%esp)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (,%eax,8), %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $24, %ecx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andl $60, %eax
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, 16(%esp,%eax), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 20(%esp,%eax), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %edi, %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %ebx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $24, %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %bl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 24(%esp,%eax), %ebp
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edx, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, %ebp, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %esi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %edi, %esi
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %esi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%ebp,%ebp), %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %edi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %edi, {{[-0-9]+}}(%e{{[sb]}}p) # 4-byte Spill
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 28(%esp,%eax), %esi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    leal (%esi,%esi), %edi
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ebx, %edi, %edi
diff --git a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
index ff13f4ba577f2e..47094f4c91daae 100644
--- a/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
+++ b/llvm/test/CodeGen/X86/widen-load-of-small-alloca.ll
@@ -605,8 +605,9 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -618,18 +619,19 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-NO-BMI2-NO-SHLD-NEXT:    andb $12, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $40, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $36, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_16byte_alloca:
@@ -673,12 +675,12 @@ define void @load_1byte_chunk_of_16byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $3, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    andb $12, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $40, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi
@@ -1224,19 +1226,26 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-NO-BMI2:       # %bb.0:
 ; X64-NO-BMI2-NEXT:    movups (%rdi), %xmm0
 ; X64-NO-BMI2-NEXT:    movups 16(%rdi), %xmm1
-; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %ecx
+; X64-NO-BMI2-NEXT:    leal (,%rsi,8), %eax
 ; X64-NO-BMI2-NEXT:    xorps %xmm2, %xmm2
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm2, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm1, -{{[0-9]+}}(%rsp)
 ; X64-NO-BMI2-NEXT:    movaps %xmm0, -{{[0-9]+}}(%rsp)
-; X64-NO-BMI2-NEXT:    movl %ecx, %eax
-; X64-NO-BMI2-NEXT:    shrb $6, %al
-; X64-NO-BMI2-NEXT:    movzbl %al, %eax
-; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rax,8), %rax
-; X64-NO-BMI2-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X64-NO-BMI2-NEXT:    shrq %cl, %rax
-; X64-NO-BMI2-NEXT:    movb %al, (%rdx)
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrb $6, %cl
+; X64-NO-BMI2-NEXT:    movzbl %cl, %esi
+; X64-NO-BMI2-NEXT:    movl -64(%rsp,%rsi,8), %edi
+; X64-NO-BMI2-NEXT:    addl %edi, %edi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    andb $56, %cl
+; X64-NO-BMI2-NEXT:    notb %cl
+; X64-NO-BMI2-NEXT:    shlq %cl, %rdi
+; X64-NO-BMI2-NEXT:    movq -72(%rsp,%rsi,8), %rsi
+; X64-NO-BMI2-NEXT:    movl %eax, %ecx
+; X64-NO-BMI2-NEXT:    shrq %cl, %rsi
+; X64-NO-BMI2-NEXT:    orb %sil, %dil
+; X64-NO-BMI2-NEXT:    movb %dil, (%rdx)
 ; X64-NO-BMI2-NEXT:    retq
 ;
 ; X64-BMI2-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1252,14 +1261,22 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X64-BMI2-NEXT:    movl %esi, %eax
 ; X64-BMI2-NEXT:    shrb $6, %al
 ; X64-BMI2-NEXT:    movzbl %al, %eax
-; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rax
-; X64-BMI2-NEXT:    movb %al, (%rdx)
+; X64-BMI2-NEXT:    shrxq %rsi, -72(%rsp,%rax,8), %rcx
+; X64-BMI2-NEXT:    # kill: def $sil killed $sil killed $rsi def $rsi
+; X64-BMI2-NEXT:    andb $56, %sil
+; X64-BMI2-NEXT:    notb %sil
+; X64-BMI2-NEXT:    movl -64(%rsp,%rax,8), %eax
+; X64-BMI2-NEXT:    addl %eax, %eax
+; X64-BMI2-NEXT:    shlxq %rsi, %rax, %rax
+; X64-BMI2-NEXT:    orb %al, %cl
+; X64-BMI2-NEXT:    movb %cl, (%rdx)
 ; X64-BMI2-NEXT:    retq
 ;
 ; X86-NO-BMI2-NO-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
 ; X86-NO-BMI2-NO-SHLD:       # %bb.0:
+; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    pushl %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    subl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    subl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %ecx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %eax
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl {{[0-9]+}}(%esp), %edx
@@ -1273,18 +1290,19 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movaps %xmm0, (%esp)
 ; X86-NO-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
-; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%edx,4), %esi
-; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movzbl %dl, %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl (%esp,%esi,4), %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%esi,4), %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    shrl %cl, %edx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl %ebx, %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    # kill: def $cl killed $cl killed $ecx
-; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    orl %esi, %edx
-; X86-NO-BMI2-NO-SHLD-NEXT:    movb %dl, (%eax)
-; X86-NO-BMI2-NO-SHLD-NEXT:    addl $72, %esp
+; X86-NO-BMI2-NO-SHLD-NEXT:    shll %cl, %ebx
+; X86-NO-BMI2-NO-SHLD-NEXT:    orb %dl, %bl
+; X86-NO-BMI2-NO-SHLD-NEXT:    movb %bl, (%eax)
+; X86-NO-BMI2-NO-SHLD-NEXT:    addl $68, %esp
 ; X86-NO-BMI2-NO-SHLD-NEXT:    popl %esi
+; X86-NO-BMI2-NO-SHLD-NEXT:    popl %ebx
 ; X86-NO-BMI2-NO-SHLD-NEXT:    retl
 ;
 ; X86-SHLD-LABEL: load_1byte_chunk_of_32byte_alloca:
@@ -1332,12 +1350,12 @@ define void @load_1byte_chunk_of_32byte_alloca(ptr %src, i64 %byteOff, ptr %dst)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl %ecx, %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrb $5, %dl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movzbl %dl, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shrxl %ecx, (%esp,%edx,4), %edx
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    notb %cl
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    movl 4(%esp,%edx,4), %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %edx, %edx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %edx, %ecx
-; X86-HAVE-BMI2-NO-SHLD-NEXT:    orl %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl %esi, %esi
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    shlxl %ecx, %esi, %ecx
+; X86-HAVE-BMI2-NO-SHLD-NEXT:    orb %dl, %cl
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    movb %cl, (%eax)
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    addl $72, %esp
 ; X86-HAVE-BMI2-NO-SHLD-NEXT:    popl %esi

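A note for readers skimming the regenerated checks above: the recurring leal (,%reg,8) / andb $24 / notb sequences are the usual two-word funnel-shift expansion, now fed with a narrowed shift count. A minimal C++ sketch of the identity those checks encode, assuming 32-bit words and x86's mod-32 shift-count semantics (the helper name is illustrative, not from the patch):

    #include <cstdint>

    // Right-shift a 64-bit value held as two 32-bit halves by c bits
    // (0 <= c <= 31), returning the low word of the result.
    // (hi + hi) << (~c & 31) equals hi << (32 - c) for c != 0 and
    // collapses to 0 at c == 0, so no branch on the count is needed;
    // this is the same leal/notb/shll pattern the checks above pin down.
    uint32_t shrd32(uint32_t lo, uint32_t hi, unsigned c) {
      uint32_t lo_part = lo >> (c & 31);
      uint32_t hi_part = (hi + hi) << (~c & 31);
      return lo_part | hi_part;
    }
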
>From 6109d0a3cd3eb140e0b4ccedb6b5551db466438b Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 20:20:43 -0700
Subject: [PATCH 53/56] Update NVPTX isNarrowingProfitable API

---
 llvm/lib/Target/NVPTX/NVPTXISelLowering.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
index 32eee0cb301873..047212676d1701 100644
--- a/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
+++ b/llvm/lib/Target/NVPTX/NVPTXISelLowering.h
@@ -506,7 +506,7 @@ class NVPTXTargetLowering : public TargetLowering {
            DstTy->getPrimitiveSizeInBits() == 32;
   }
 
-  bool isNarrowingProfitable(EVT SrcVT, EVT DestVT) const override {
+  bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override {
     // Truncating 64-bit to 32-bit is free in SASS.
     if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
       return false;

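With the node now passed in, a target can make the narrowing decision per-node instead of purely on types. A minimal sketch of an override under the new signature, written as a member of a TargetLowering subclass with the usual LLVM headers in scope; the body is an assumption for illustration (the diff context above elides the real NVPTX body) that keeps the "truncating 64-bit to 32-bit is free in SASS" policy:

    bool isNarrowingProfitable(SDNode *N, EVT SrcVT, EVT DestVT) const override {
      // Hypothetical body: only scalar-integer i64 -> i32 narrowing is
      // treated as profitable, matching the comment in the diff above.
      // N is available here for finer-grained, per-node checks.
      if (!(SrcVT.isScalarInteger() && DestVT.isScalarInteger()))
        return false;
      return SrcVT == MVT::i64 && DestVT == MVT::i32;
    }
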
>From 71d2a8b89c25f2bab06845249bb571764ac8e967 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Mon, 23 Sep 2024 22:43:28 -0700
Subject: [PATCH 54/56] Update AMDGPU tests

---
 llvm/test/CodeGen/AMDGPU/anyext.ll            |   12 +-
 .../CodeGen/AMDGPU/combine-cond-add-sub.ll    |    3 +-
 .../CodeGen/AMDGPU/computeNumSignBits-mul.ll  |    6 +-
 llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll |   70 +-
 llvm/test/CodeGen/AMDGPU/idot8u.ll            | 1986 +++++++++--------
 .../CodeGen/AMDGPU/si-annotate-cf-kill.ll     |   52 +-
 llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll    |   20 +-
 llvm/test/CodeGen/AMDGPU/xor.ll               |   12 +-
 8 files changed, 1208 insertions(+), 953 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/anyext.ll b/llvm/test/CodeGen/AMDGPU/anyext.ll
index 8b6c8be9f37882..fade5e4327bd68 100644
--- a/llvm/test/CodeGen/AMDGPU/anyext.ll
+++ b/llvm/test/CodeGen/AMDGPU/anyext.ll
@@ -74,8 +74,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
 ; GCN-NEXT:    v_lshlrev_b32_e32 v0, 1, v1
 ; GCN-NEXT:    s_mov_b64 s[2:3], s[14:15]
 ; GCN-NEXT:    v_mov_b32_e32 v1, v3
-; GCN-NEXT:    buffer_load_ushort v2, v[2:3], s[12:15], 0 addr64
-; GCN-NEXT:    buffer_load_ushort v0, v[0:1], s[0:3], 0 addr64
+; GCN-NEXT:    buffer_load_ubyte v2, v[2:3], s[12:15], 0 addr64
+; GCN-NEXT:    buffer_load_ubyte v0, v[0:1], s[0:3], 0 addr64
 ; GCN-NEXT:    s_mov_b32 s10, -1
 ; GCN-NEXT:    s_mov_b32 s8, s4
 ; GCN-NEXT:    s_mov_b32 s9, s5
@@ -99,8 +99,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s0, v0
 ; GFX8-NEXT:    v_addc_u32_e32 v1, vcc, 0, v1, vcc
-; GFX8-NEXT:    flat_load_ushort v2, v[2:3]
-; GFX8-NEXT:    flat_load_ushort v0, v[0:1]
+; GFX8-NEXT:    flat_load_ubyte v2, v[2:3]
+; GFX8-NEXT:    flat_load_ubyte v0, v[0:1]
 ; GFX8-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX8-NEXT:    s_mov_b32 s6, -1
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
@@ -118,8 +118,8 @@ define amdgpu_kernel void @s_anyext_i16_i32(ptr addrspace(1) %out, ptr addrspace
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 1, v0
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    global_load_ushort v2, v0, s[6:7]
-; GFX9-NEXT:    global_load_ushort v3, v1, s[0:1]
+; GFX9-NEXT:    global_load_ubyte v2, v0, s[6:7]
+; GFX9-NEXT:    global_load_ubyte v3, v1, s[0:1]
 ; GFX9-NEXT:    s_mov_b32 s7, 0xf000
 ; GFX9-NEXT:    s_mov_b32 s6, -1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
diff --git a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
index b5e0589cf9e466..c4ca02bcdb3dcc 100644
--- a/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
+++ b/llvm/test/CodeGen/AMDGPU/combine-cond-add-sub.ll
@@ -70,8 +70,9 @@ define i16 @add1_i16(ptr addrspace(1) nocapture %arg, ptr addrspace(1) nocapture
 ; GFX9-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX9-NEXT:    v_bfe_u32 v1, v31, 10, 10
 ; GFX9-NEXT:    v_cmp_gt_u32_e32 vcc, v2, v1
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, vcc
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_addc_co_u32_e32 v0, vcc, 0, v0, vcc
+; GFX9-NEXT:    v_add_u16_e32 v0, v0, v1
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
 bb:
   %x = tail call i32 @llvm.amdgcn.workitem.id.x()
diff --git a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
index 07a4478241b186..a906563c76a307 100644
--- a/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
+++ b/llvm/test/CodeGen/AMDGPU/computeNumSignBits-mul.ll
@@ -5,9 +5,9 @@ define i16 @num_sign_bits_mul_i48_0(i8 %X, i8 %Y, i8 %Z, i8 %W) {
 ; GFX9-LABEL: num_sign_bits_mul_i48_0:
 ; GFX9:       ; %bb.0:
 ; GFX9-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_i32_i24_e32 v0, v0, v2
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v2, sext(v2), sext(v3) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v0, sext(v0), sext(v1) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_lo_u16_e32 v0, v0, v2
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %A = sext i8 %X to i48
   %B = sext i8 %Y to i48
diff --git a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
index b32630a97b3ad0..9f4d4c05ed0fa4 100644
--- a/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
+++ b/llvm/test/CodeGen/AMDGPU/fneg-combines.f16.ll
@@ -3063,7 +3063,7 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -3078,43 +3078,45 @@ define half @v_fneg_fp_round_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    v_bfe_u32 v3, v1, 20, 11
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v1, 16, v1
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_4)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v2, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_med3_i32 v2, v4, 0, 13
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, v0
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 9, 0x7c00
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
   %fneg = fneg half %fpround
@@ -3210,7 +3212,7 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    s_setpc_b64 s[30:31]
@@ -3225,42 +3227,43 @@ define half @v_fneg_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, 0, v0
 ; GFX11-NEXT:    v_sub_nc_u32_e32 v4, 0x3f1, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1)
+; GFX11-NEXT:    v_and_b32_e32 v1, 0xffff8000, v1
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_and_or_b32 v0, 0xffe, v2, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_med3_i32 v2, v4, 0, 13
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_or_b32_e32 v4, 0x1000, v0
 ; GFX11-NEXT:    v_cmp_ne_u32_e64 s1, 0, v0
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, v2, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_lshlrev_b32_e32 v2, v2, v5
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_ne_u32_e32 vcc_lo, v2, v4
 ; GFX11-NEXT:    v_cndmask_b32_e64 v2, 0, 1, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_or_b32_e32 v2, v5, v2
 ; GFX11-NEXT:    v_add_nc_u32_e32 v3, 0xfffffc10, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_2) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    v_lshl_or_b32 v4, v3, 12, v0
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 1, v3
 ; GFX11-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v4, v2, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_lshl_or_b32 v0, v0, 9, 0x7c00
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_and_b32_e32 v4, 7, v2
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v2, 2, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v4
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_3)
 ; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v2, vcc_lo, 0, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v3
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
   %fpround = fptrunc double %fneg.a to half
@@ -3363,7 +3366,7 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v5
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v4, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 0x8000
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffff8000
 ; VI-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
@@ -3406,17 +3409,18 @@ define { half, double } @v_fneg_fp_round_store_use_fneg_f64_to_f16(double %a) #0
 ; GFX11-NEXT:    v_cmp_lt_i32_e32 vcc_lo, 5, v5
 ; GFX11-NEXT:    v_cmp_eq_u32_e64 s0, 3, v5
 ; GFX11-NEXT:    v_lshrrev_b32_e32 v5, 16, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_2) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    s_or_b32 vcc_lo, s0, vcc_lo
+; GFX11-NEXT:    v_and_b32_e32 v5, 0xffff8000, v5
 ; GFX11-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_gt_i32_e32 vcc_lo, 31, v4
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v3, 0x7c00, v3, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v4
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, v3, v2, vcc_lo
-; GFX11-NEXT:    v_and_or_b32 v3, 0x8000, v5, v2
+; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2)
+; GFX11-NEXT:    v_or_b32_e32 v3, v5, v2
 ; GFX11-NEXT:    v_xor_b32_e32 v2, 0x80000000, v1
-; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2)
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v0 :: v_dual_mov_b32 v0, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
@@ -3521,7 +3525,7 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; VI-NEXT:    v_or_b32_e32 v4, 0x7c00, v4
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v6
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v5, v4, vcc
-; VI-NEXT:    v_mov_b32_e32 v4, 0x8000
+; VI-NEXT:    v_mov_b32_e32 v4, 0xffff8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v0, v1, v0
 ; VI-NEXT:    v_mov_b32_e32 v1, v2
@@ -3571,9 +3575,9 @@ define { half, double } @v_fneg_fp_round_multi_use_fneg_f64_to_f16(double %a, do
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v0, 0x7c00, v0, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v6
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v0, v4, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v0, v4 :: v_dual_and_b32 v1, 0xffff8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v0, 0x8000, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v0, v1, v0
 ; GFX11-NEXT:    v_dual_mov_b32 v1, v2 :: v_dual_mov_b32 v2, v3
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fneg.a = fneg double %a
@@ -3676,7 +3680,7 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; VI-NEXT:    v_or_b32_e32 v0, 0x7c00, v0
 ; VI-NEXT:    v_cmp_eq_u32_e32 vcc, s4, v3
 ; VI-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc
-; VI-NEXT:    v_mov_b32_e32 v2, 0x8000
+; VI-NEXT:    v_mov_b32_e32 v2, 0xffff8000
 ; VI-NEXT:    v_and_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; VI-NEXT:    v_or_b32_e32 v1, v1, v0
 ; VI-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
@@ -3725,9 +3729,9 @@ define { half, half } @v_fneg_multi_use_fp_round_fneg_f64_to_f16(double %a) #0 {
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_2)
 ; GFX11-NEXT:    v_cndmask_b32_e32 v2, 0x7c00, v2, vcc_lo
 ; GFX11-NEXT:    v_cmp_eq_u32_e32 vcc_lo, 0x40f, v3
-; GFX11-NEXT:    v_cndmask_b32_e32 v0, v2, v0, vcc_lo
+; GFX11-NEXT:    v_dual_cndmask_b32 v0, v2, v0 :: v_dual_and_b32 v1, 0xffff8000, v1
 ; GFX11-NEXT:    s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
-; GFX11-NEXT:    v_and_or_b32 v1, 0x8000, v1, v0
+; GFX11-NEXT:    v_or_b32_e32 v1, v1, v0
 ; GFX11-NEXT:    v_xor_b32_e32 v0, 0x8000, v1
 ; GFX11-NEXT:    s_setpc_b64 s[30:31]
   %fpround = fptrunc double %a to half
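For context on the v_fneg_fp_round checks above: negating an IEEE half only flips bit 15, so the fp_round + fneg pair lowers to plain bit operations on the already-rounded value rather than a second FP op. A one-line C++ sketch of that identity (the helper name is illustrative, not from the tests):

    #include <cstdint>

    // fneg(half) flips the sign bit; rounding is unaffected, so the
    // bits of fptrunc(x) can be reused directly.
    uint16_t fneg_half_bits(uint16_t h) { return h ^ 0x8000u; }
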
diff --git a/llvm/test/CodeGen/AMDGPU/idot8u.ll b/llvm/test/CodeGen/AMDGPU/idot8u.ll
index 779107cc40e1fb..b6f60d83620e7c 100644
--- a/llvm/test/CodeGen/AMDGPU/idot8u.ll
+++ b/llvm/test/CodeGen/AMDGPU/idot8u.ll
@@ -81,32 +81,32 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 4, 4
+; GFX8-NEXT:    v_bfe_u32 v8, v3, 8, 4
+; GFX8-NEXT:    v_bfe_u32 v10, v3, 12, 4
+; GFX8-NEXT:    v_bfe_u32 v12, v3, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v14, v3, 20, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
-; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 4, 4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, s2
-; GFX8-NEXT:    v_mad_u32_u24 v0, v8, v15, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, v7, v14, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, v6, v13, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, v5, v12, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, v4, v11, v0
-; GFX8-NEXT:    v_mad_u32_u24 v0, v2, v10, v0
-; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v9, v0
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, s2
+; GFX8-NEXT:    v_bfe_u32 v9, v0, 8, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v6, v7, v4
+; GFX8-NEXT:    v_bfe_u32 v11, v0, 12, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v8, v9, v4
+; GFX8-NEXT:    v_bfe_u32 v13, v0, 16, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v10, v11, v4
+; GFX8-NEXT:    v_bfe_u32 v15, v0, 20, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v12, v13, v4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v0
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 24, 4
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 24, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v14, v15, v4
+; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v4
+; GFX8-NEXT:    v_mad_u32_u24 v2, v1, v2, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -129,36 +129,36 @@ define amdgpu_kernel void @udot8_acc32(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v4, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v11, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX9-NEXT:    v_bfe_u32 v7, v1, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v8, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 8, 4
+; GFX9-NEXT:    v_bfe_u32 v11, v1, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v1, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
+; GFX9-NEXT:    v_bfe_u32 v15, v1, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
+; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v6
+; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v7, v8
+; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v9, v10
+; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v11, v12
 ; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v9, v16
-; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_add3_u32 v1, v1, s2, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-NEXT:    v_add3_u32 v1, v1, v8, v7
-; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-NEXT:    v_mul_u32_u24_e32 v3, v3, v10
-; GFX9-NEXT:    v_add3_u32 v1, v1, v6, v5
-; GFX9-NEXT:    v_add3_u32 v1, v1, v4, v3
+; GFX9-NEXT:    v_add3_u32 v3, v5, s2, v6
+; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v13, v14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v10, v15, v16
+; GFX9-NEXT:    v_add3_u32 v3, v3, v7, v8
+; GFX9-NEXT:    v_add3_u32 v3, v3, v9, v10
+; GFX9-NEXT:    v_add3_u32 v1, v3, v1, v2
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -336,8 +336,8 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -350,37 +350,46 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -389,44 +398,53 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ushort v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT:    v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-NEXT:    v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -435,44 +453,53 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -482,7 +509,6 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -491,37 +517,47 @@ define amdgpu_kernel void @udot8_acc16(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ushort v4, v1, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v7, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 12, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v9, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 20, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v9
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v8, v10, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT:    global_store_short v1, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v4, v3
+; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    global_store_short v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                        ptr addrspace(1) %src2,
                                        ptr addrspace(1) nocapture %dst) {
@@ -654,8 +690,8 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -668,37 +704,46 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -707,44 +752,53 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT:    v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-NEXT:    v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -753,44 +807,53 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v6, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v13, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -800,7 +863,6 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -809,37 +871,47 @@ define amdgpu_kernel void @udot8_acc8(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 24, 4
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v7, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 12, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v9, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 20, v2
+; GFX10-DL-NEXT:    v_and_b32_sdwa v10, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v9
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v8, v10, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v4, v3
+; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
@@ -973,8 +1045,8 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -987,37 +1059,44 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1027,44 +1106,51 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1074,44 +1160,51 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
@@ -1122,7 +1215,6 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -1131,38 +1223,46 @@ define amdgpu_kernel void @udot8_acc4(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v7, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 12, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v9, v3
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v2
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v4, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX10-DL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                       ptr addrspace(1) %src2,
                                       ptr addrspace(1) nocapture %dst) {
@@ -1280,8 +1380,8 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -1294,37 +1394,44 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -1334,44 +1441,51 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
@@ -1381,44 +1495,51 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 28, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 24, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 4, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 24, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 4, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 4, v2
 ; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v1, v2, v3
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v17, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v16, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v11, v17, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v10, v16, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v9, v15, v1
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v8, v4, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v7, v14, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v6, v13, v1
 ; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v5, v12, v1
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v1, v4, v11, v1
 ; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
 ; GFX9-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
@@ -1429,7 +1550,6 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
-; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -1438,38 +1558,46 @@ define amdgpu_kernel void @udot8_CommutationInsideMAD(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX10-DL-NEXT:    s_clause 0x1
-; GFX10-DL-NEXT:    global_load_dword v2, v0, s[4:5]
-; GFX10-DL-NEXT:    global_load_dword v3, v0, s[6:7]
-; GFX10-DL-NEXT:    global_load_ubyte v4, v1, s[0:1]
+; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
+; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
+; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 4, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 4, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v4
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u16 v0, v0, v5, v4
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 8, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 24, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v7, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 12, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v6, v3
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v8
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v9, v3
+; GFX10-DL-NEXT:    v_and_b32_sdwa v7, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 20, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v2
+; GFX10-DL-NEXT:    v_mad_u16 v3, v7, v4, v3
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 24, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 24, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
-; GFX10-DL-NEXT:    v_mad_u16 v0, v6, v7, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v4, v5, v0
-; GFX10-DL-NEXT:    v_mad_u16 v0, v2, v3, v0
-; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v0
-; GFX10-DL-NEXT:    global_store_byte v1, v0, s[0:1]
+; GFX10-DL-NEXT:    v_mad_u16 v3, v5, v6, v3
+; GFX10-DL-NEXT:    v_mad_u16 v3, v4, v7, v3
+; GFX10-DL-NEXT:    v_mad_u16 v1, v1, v2, v3
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    global_store_byte v0, v1, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                                       ptr addrspace(1) %src2,
                                                       ptr addrspace(1) nocapture %dst) {
@@ -1603,34 +1731,34 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v2, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v4, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v5, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v6, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
+; GFX8-NEXT:    v_and_b32_e32 v4, 15, v3
+; GFX8-NEXT:    v_bfe_u32 v6, v3, 4, 4
+; GFX8-NEXT:    v_bfe_u32 v8, v3, 8, 4
+; GFX8-NEXT:    v_bfe_u32 v10, v3, 12, 4
+; GFX8-NEXT:    v_bfe_u32 v12, v3, 16, 4
+; GFX8-NEXT:    v_bfe_u32 v14, v3, 20, 4
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v0
-; GFX8-NEXT:    v_bfe_u32 v10, v0, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v0, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v12, v0, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v13, v0, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v0, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v0, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v0, 15, v0
+; GFX8-NEXT:    v_and_b32_e32 v5, 15, v0
+; GFX8-NEXT:    v_bfe_u32 v7, v0, 4, 4
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX8-NEXT:    v_mad_u32_u24 v16, v3, v0, s2
-; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v16
-; GFX8-NEXT:    v_mad_u32_u24 v3, v8, v15, v16
-; GFX8-NEXT:    v_mad_u32_u24 v3, v7, v14, v3
-; GFX8-NEXT:    v_mad_u32_u24 v3, v6, v13, v3
-; GFX8-NEXT:    v_mad_u32_u24 v3, v5, v12, v3
-; GFX8-NEXT:    v_mad_u32_u24 v3, v4, v11, v3
-; GFX8-NEXT:    v_mad_u32_u24 v2, v2, v10, v3
-; GFX8-NEXT:    v_mad_u32_u24 v1, v1, v9, v2
-; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v0, v1
+; GFX8-NEXT:    v_mad_u32_u24 v16, v4, v5, s2
+; GFX8-NEXT:    v_bfe_u32 v9, v0, 8, 4
+; GFX8-NEXT:    v_mad_u32_u24 v4, v4, v5, v16
+; GFX8-NEXT:    v_mad_u32_u24 v5, v6, v7, v16
+; GFX8-NEXT:    v_bfe_u32 v11, v0, 12, 4
+; GFX8-NEXT:    v_mad_u32_u24 v5, v8, v9, v5
+; GFX8-NEXT:    v_bfe_u32 v13, v0, 16, 4
+; GFX8-NEXT:    v_mad_u32_u24 v5, v10, v11, v5
+; GFX8-NEXT:    v_bfe_u32 v15, v0, 20, 4
+; GFX8-NEXT:    v_mad_u32_u24 v5, v12, v13, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v1, 28, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v0
+; GFX8-NEXT:    v_bfe_u32 v3, v3, 24, 4
+; GFX8-NEXT:    v_bfe_u32 v0, v0, 24, 4
+; GFX8-NEXT:    v_mad_u32_u24 v5, v14, v15, v5
+; GFX8-NEXT:    v_mad_u32_u24 v0, v3, v0, v5
+; GFX8-NEXT:    v_mad_u32_u24 v0, v1, v2, v0
+; GFX8-NEXT:    v_add_u32_e32 v2, vcc, v4, v0
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_store_dword v[0:1], v2
@@ -1653,37 +1781,37 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_u32 v3, v1, 4, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 20, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX9-NEXT:    v_bfe_u32 v7, v1, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v8, v2, 4, 4
+; GFX9-NEXT:    v_bfe_u32 v9, v1, 8, 4
+; GFX9-NEXT:    v_bfe_u32 v10, v2, 8, 4
+; GFX9-NEXT:    v_bfe_u32 v11, v1, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v12, v2, 12, 4
+; GFX9-NEXT:    v_bfe_u32 v13, v1, 16, 4
 ; GFX9-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
+; GFX9-NEXT:    v_bfe_u32 v15, v1, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
+; GFX9-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-NEXT:    v_bfe_u32 v2, v2, 24, 4
+; GFX9-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX9-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
-; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
-; GFX9-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
-; GFX9-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
-; GFX9-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-NEXT:    v_add3_u32 v2, v2, v9, v8
-; GFX9-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-NEXT:    v_add3_u32 v2, v2, v7, v6
-; GFX9-NEXT:    v_add3_u32 v2, v2, v5, v4
-; GFX9-NEXT:    v_add3_u32 v1, v17, v1, v2
+; GFX9-NEXT:    v_mad_u32_u24 v3, v5, v6, s2
+; GFX9-NEXT:    v_mul_u32_u24_e32 v9, v9, v10
+; GFX9-NEXT:    v_mul_u32_u24_e32 v10, v11, v12
+; GFX9-NEXT:    v_mad_u32_u24 v4, v7, v8, v3
+; GFX9-NEXT:    v_mul_u32_u24_e32 v11, v13, v14
+; GFX9-NEXT:    v_mul_u32_u24_e32 v12, v15, v16
+; GFX9-NEXT:    v_add3_u32 v4, v4, v9, v10
+; GFX9-NEXT:    v_add3_u32 v4, v4, v11, v12
+; GFX9-NEXT:    v_mul_u32_u24_e32 v17, v5, v6
+; GFX9-NEXT:    v_add3_u32 v1, v4, v1, v2
+; GFX9-NEXT:    v_add3_u32 v1, v17, v3, v1
 ; GFX9-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -1704,37 +1832,37 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v3, v1, 4, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 28, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 20, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v4, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v8, v2, 4, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 8, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v10, v2, 8, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 12, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v13, v1, 16, 4
 ; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 8, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v2, 15, v2
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v1, v2
+; GFX9-DL-NEXT:    v_bfe_u32 v15, v1, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
+; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v4
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX9-DL-NEXT:    v_mad_u32_u24 v1, v1, v2, s2
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v9, v9, v16
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v8, v8, v15
-; GFX9-DL-NEXT:    v_mad_u32_u24 v2, v3, v10, v1
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v14
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v13
-; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v9, v8
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v4, v4, v11
-; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v7, v6
-; GFX9-DL-NEXT:    v_add3_u32 v2, v2, v5, v4
-; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v1, v2
+; GFX9-DL-NEXT:    v_mad_u32_u24 v3, v5, v6, s2
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v9, v9, v10
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v10, v11, v12
+; GFX9-DL-NEXT:    v_mad_u32_u24 v4, v7, v8, v3
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v11, v13, v14
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v12, v15, v16
+; GFX9-DL-NEXT:    v_add3_u32 v4, v4, v9, v10
+; GFX9-DL-NEXT:    v_add3_u32 v4, v4, v11, v12
+; GFX9-DL-NEXT:    v_mul_u32_u24_e32 v17, v5, v6
+; GFX9-DL-NEXT:    v_add3_u32 v1, v4, v1, v2
+; GFX9-DL-NEXT:    v_add3_u32 v1, v17, v3, v1
 ; GFX9-DL-NEXT:    global_store_dword v0, v1, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -1756,38 +1884,38 @@ define amdgpu_kernel void @udot8_multiuses_mul1(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    s_load_dword s2, s[0:1], 0x0
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v1
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v1, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v12, v2, 12, 4
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
+; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 8, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 12, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v2, 12, 4
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v8
 ; GFX10-DL-NEXT:    s_waitcnt lgkmcnt(0)
-; GFX10-DL-NEXT:    v_mad_u32_u24 v13, v8, v9, s2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v8, v4, v5, s2
+; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
+; GFX10-DL-NEXT:    v_bfe_u32 v13, v2, 16, 4
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v10, v10, v11
+; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v11
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v7, v7, v12
-; GFX10-DL-NEXT:    v_mad_u32_u24 v0, v0, v10, v13
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v2
+; GFX10-DL-NEXT:    v_mad_u32_u24 v6, v6, v9, v8
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v0, 28, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 28, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v9, v12, v13
+; GFX10-DL-NEXT:    v_bfe_u32 v1, v1, 24, 4
 ; GFX10-DL-NEXT:    v_bfe_u32 v2, v2, 24, 4
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v6, v6, v15
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v5, v5, v14
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v7
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v4, v2
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v2, v3, v10
-; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v3, v8, v9
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v6, v5
-; GFX10-DL-NEXT:    v_add3_u32 v0, v0, v1, v2
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v11, v11, v14
+; GFX10-DL-NEXT:    v_add3_u32 v6, v6, v7, v10
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v0, v0, v3
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v3, v4, v5
+; GFX10-DL-NEXT:    v_mul_u32_u24_e32 v1, v1, v2
+; GFX10-DL-NEXT:    v_add3_u32 v2, v6, v9, v11
+; GFX10-DL-NEXT:    v_add3_u32 v0, v2, v1, v0
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v1, 0
-; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v13, v0
+; GFX10-DL-NEXT:    v_add3_u32 v0, v3, v8, v0
 ; GFX10-DL-NEXT:    global_store_dword v1, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
                                                 ptr addrspace(1) %src2,
@@ -2158,8 +2286,8 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -2172,37 +2300,46 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ushort v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v6, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_and_b32_sdwa v7, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_bfe_u32 v13, v2, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_and_b32_sdwa v14, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    flat_store_short v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
 ;
@@ -2211,53 +2348,62 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
-; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ushort v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v4, 15, v1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v11, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s2
-; GFX9-NEXT:    v_perm_b32 v7, v12, v11, s2
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s2
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s2
-; GFX9-NEXT:    v_perm_b32 v9, v14, v13, s2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 24, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 24, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v2, v2, v17, s2
-; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s2
-; GFX9-NEXT:    v_perm_b32 v10, v16, v15, s2
-; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s2
+; GFX9-NEXT:    v_perm_b32 v8, v13, v12, s2
+; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s2
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s2
+; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s2
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
+; GFX9-NEXT:    v_perm_b32 v2, v17, v16, s2
+; GFX9-NEXT:    v_perm_b32 v4, v10, v9, s2
+; GFX9-NEXT:    v_perm_b32 v9, v15, v14, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u16_e32 v3, v5, v3
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v7, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u16_e32 v3, v3, v4
+; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
@@ -2270,53 +2416,62 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0x5040100
-; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, 15, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT:    v_perm_b32 v6, v7, v6, s2
-; GFX9-DL-NEXT:    v_perm_b32 v7, v12, v11, s2
-; GFX9-DL-NEXT:    v_perm_b32 v4, v5, v4, s2
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-DL-NEXT:    v_perm_b32 v8, v9, v8, s2
-; GFX9-DL-NEXT:    v_perm_b32 v9, v14, v13, s2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 24, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v17, s2
-; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v10, s2
-; GFX9-DL-NEXT:    v_perm_b32 v10, v16, v15, s2
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
+; GFX9-DL-NEXT:    v_perm_b32 v7, v8, v7, s2
+; GFX9-DL-NEXT:    v_perm_b32 v8, v13, v12, s2
+; GFX9-DL-NEXT:    v_perm_b32 v5, v6, v5, s2
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v4, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v11, s2
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
+; GFX9-DL-NEXT:    v_perm_b32 v2, v17, v16, s2
+; GFX9-DL-NEXT:    v_perm_b32 v4, v10, v9, s2
+; GFX9-DL-NEXT:    v_perm_b32 v9, v15, v14, s2
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v5, v3
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v7, v9
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v4
+; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
@@ -2330,6 +2485,7 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -2343,45 +2499,53 @@ define amdgpu_kernel void @udot8_acc16_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ushort v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 4, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX10-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX10-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 8, 4
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
-; GFX10-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 20, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
+; GFX10-DL-NEXT:    v_perm_b32 v5, v7, v5, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v6, v6, v10, 0x5040100
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v12
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v13
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
+; GFX10-DL-NEXT:    v_perm_b32 v8, v10, v8, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v7, v9, v7, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
+; GFX10-DL-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v12
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v8
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 24, 4
+; GFX10-DL-NEXT:    v_perm_b32 v6, v6, v11, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
-; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v8, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v5
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
@@ -2482,8 +2646,8 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -2496,56 +2660,65 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 28, v3
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v12, v3, 16, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 4, v3
+; GFX8-NEXT:    v_and_b32_e32 v7, 15, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v11, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v12, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 28, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 28, v2
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 24, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v19, v2, 16, 4
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v11, v11, v18 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_mul_lo_u16_e32 v18, v10, v17
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v9, v9, v16 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_bfe_u32 v5, v3, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v6, 15, v3
-; GFX8-NEXT:    v_bfe_u32 v3, v2, 4, 4
-; GFX8-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX8-NEXT:    v_mul_lo_u16_e32 v2, v12, v19
-; GFX8-NEXT:    v_mul_lo_u16_e32 v8, v8, v15
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v7, v7, v14 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v9, v18, v9
-; GFX8-NEXT:    v_mul_lo_u16_sdwa v5, v5, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v3, v2, v11
-; GFX8-NEXT:    v_or_b32_e32 v7, v8, v7
-; GFX8-NEXT:    v_lshlrev_b32_e32 v8, 16, v9
-; GFX8-NEXT:    v_mul_lo_u16_e32 v6, v6, v13
-; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v7
-; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX8-NEXT:    v_or_b32_e32 v6, v6, v5
-; GFX8-NEXT:    v_or_b32_e32 v5, v5, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX8-NEXT:    v_and_b32_e32 v14, 15, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v18, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
+; GFX8-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX8-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_e32 v3, v12, v5
+; GFX8-NEXT:    v_mul_lo_u16_e32 v19, v11, v18
+; GFX8-NEXT:    v_or_b32_e32 v2, v3, v2
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v3, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_mul_lo_u16_e32 v9, v9, v16
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX8-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX8-NEXT:    v_lshlrev_b32_e32 v10, 16, v2
+; GFX8-NEXT:    v_or_b32_e32 v3, v19, v3
+; GFX8-NEXT:    v_or_b32_e32 v8, v9, v8
+; GFX8-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
+; GFX8-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v2
+; GFX8-NEXT:    v_or_b32_sdwa v3, v3, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 16, v8
+; GFX8-NEXT:    v_or_b32_e32 v7, v7, v6
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 8, v3
+; GFX8-NEXT:    v_or_b32_e32 v6, v6, v2
 ; GFX8-NEXT:    v_lshrrev_b64 v[2:3], 24, v[2:3]
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 8, v5
+; GFX8-NEXT:    v_lshrrev_b32_e32 v3, 8, v6
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
-; GFX8-NEXT:    v_add_u16_e32 v3, v6, v4
-; GFX8-NEXT:    v_add_u16_e32 v3, v3, v5
-; GFX8-NEXT:    v_add_u16_e32 v3, v3, v7
+; GFX8-NEXT:    v_add_u16_e32 v4, v7, v4
+; GFX8-NEXT:    v_add_u16_e32 v3, v4, v3
+; GFX8-NEXT:    v_add_u16_e32 v3, v3, v8
 ; GFX8-NEXT:    v_add_u16_e32 v2, v3, v2
-; GFX8-NEXT:    v_mad_u16 v2, v12, v19, v2
-; GFX8-NEXT:    v_add_u16_e32 v2, v2, v8
-; GFX8-NEXT:    v_lshrrev_b32_e32 v9, 8, v9
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
+; GFX8-NEXT:    v_add_u16_e32 v2, v2, v10
+; GFX8-NEXT:    v_mad_u16 v2, v12, v5, v2
 ; GFX8-NEXT:    v_add_u16_e32 v2, v2, v9
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2561,57 +2734,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    global_load_ubyte v4, v3, s[0:1]
+; GFX9-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_bfe_u32 v0, v1, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 8, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 20, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_bfe_u32 v1, v2, 4, 4
-; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 12, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 8, 4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 24, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v2, v2, 16, 4
-; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v18, 20, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 15, v1
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-NEXT:    v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 4, v2
+; GFX9-NEXT:    v_and_b32_e32 v13, 15, v2
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
+; GFX9-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_and_b32_e32 v0, 15, v5
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX9-NEXT:    v_and_b32_e32 v8, 15, v11
+; GFX9-NEXT:    v_and_b32_e32 v11, 15, v14
+; GFX9-NEXT:    v_and_b32_e32 v14, 15, v15
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v18
+; GFX9-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-NEXT:    v_mul_lo_u16_e32 v18, v12, v2
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
 ; GFX9-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v0, v18, v10
-; GFX9-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-NEXT:    v_or_b32_e32 v5, v5, v12
-; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-NEXT:    v_or_b32_e32 v10, v12, v0
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_e32 v6, v6, v13
+; GFX9-NEXT:    v_or_b32_sdwa v13, v17, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v11, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_e32 v0, v18, v8
+; GFX9-NEXT:    v_or_b32_e32 v5, v7, v5
+; GFX9-NEXT:    v_or_b32_sdwa v1, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
+; GFX9-NEXT:    v_or_b32_e32 v6, v6, v11
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_or_b32_e32 v8, v11, v0
 ; GFX9-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v4, v5, v4
+; GFX9-NEXT:    v_add_u16_e32 v4, v6, v4
 ; GFX9-NEXT:    v_add_u16_e32 v1, v4, v1
-; GFX9-NEXT:    v_add_u16_e32 v1, v1, v6
+; GFX9-NEXT:    v_add_u16_e32 v1, v1, v5
 ; GFX9-NEXT:    v_add_u16_e32 v0, v1, v0
-; GFX9-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-NEXT:    v_add_u16_e32 v0, v0, v8
-; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v12, v2, v0
 ; GFX9-NEXT:    v_add_u16_e32 v0, v0, v7
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; GFX9-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-NEXT:    v_add_u16_e32 v0, v0, v10
 ; GFX9-NEXT:    global_store_byte v3, v0, s[0:1]
 ; GFX9-NEXT:    s_endpgm
 ;
@@ -2626,57 +2808,66 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    global_load_ubyte v4, v3, s[0:1]
+; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v11, v1, 16, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_bfe_u32 v1, v2, 4, 4
-; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 12, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 8, 4
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 28, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v2, v2, 16, 4
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v11, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 12, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v18, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 4, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v13, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_and_b32_e32 v0, 15, v5
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX9-DL-NEXT:    v_and_b32_e32 v8, 15, v11
+; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v14
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 15, v15
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v18
+; GFX9-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v18, v12, v2
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v10, v10, v17 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v17, v9, v16
 ; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v8, v8, v15 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v7, v7, v14
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v6, v6, v13 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v12, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v0, v18, v10
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v17, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-DL-NEXT:    v_or_b32_e32 v6, v7, v6
-; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX9-DL-NEXT:    v_or_b32_e32 v5, v5, v12
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v8
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
-; GFX9-DL-NEXT:    v_or_b32_e32 v10, v12, v0
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v5, v5, v11 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_e32 v6, v6, v13
+; GFX9-DL-NEXT:    v_or_b32_sdwa v13, v17, v10 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_mul_lo_u16_sdwa v11, v0, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-DL-NEXT:    v_or_b32_e32 v0, v18, v8
+; GFX9-DL-NEXT:    v_or_b32_e32 v5, v7, v5
+; GFX9-DL-NEXT:    v_or_b32_sdwa v1, v0, v13 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v5
+; GFX9-DL-NEXT:    v_or_b32_e32 v6, v6, v11
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-DL-NEXT:    v_or_b32_e32 v8, v11, v0
 ; GFX9-DL-NEXT:    v_lshrrev_b64 v[0:1], 24, v[0:1]
-; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v10
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v4, v5, v4
+; GFX9-DL-NEXT:    v_add_u16_e32 v4, v6, v4
 ; GFX9-DL-NEXT:    v_add_u16_e32 v1, v4, v1
-; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v6
+; GFX9-DL-NEXT:    v_add_u16_e32 v1, v1, v5
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v1, v0
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v11, v2, v0
-; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v8
-; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v12, v2, v0
 ; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v7
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 8, v10
+; GFX9-DL-NEXT:    v_mad_legacy_u16 v0, v9, v16, v0
+; GFX9-DL-NEXT:    v_add_u16_e32 v0, v0, v10
 ; GFX9-DL-NEXT:    global_store_byte v3, v0, s[0:1]
 ; GFX9-DL-NEXT:    s_endpgm
 ;
@@ -2698,55 +2889,64 @@ define amdgpu_kernel void @udot8_acc8_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX10-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX10-DL-NEXT:    global_load_ubyte v3, v4, s[0:1]
+; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 15
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 12, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 4, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 12, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v9, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v10, 28, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v11, 20, v1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v12, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v0, v1, 4, 4
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 8, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 28, v1
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v11, v1, 20, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v12, v1, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v1, v2, 8, 4
-; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v9
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 4, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v14, 28, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v15, v2, 20, 4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 12, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v15, 8, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX10-DL-NEXT:    v_and_b32_e32 v1, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v5
+; GFX10-DL-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX10-DL-NEXT:    v_and_b32_e32 v14, 15, v2
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v7, v1
-; GFX10-DL-NEXT:    v_lshlrev_b16 v6, 8, v6
-; GFX10-DL-NEXT:    v_and_b32_e32 v13, 15, v2
-; GFX10-DL-NEXT:    v_mul_lo_u16 v0, v0, v9
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v16, v2, 24, 4
-; GFX10-DL-NEXT:    v_or_b32_e32 v6, v1, v6
-; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v11, v15
-; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v14
-; GFX10-DL-NEXT:    v_lshlrev_b16 v9, 8, v0
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 20, v2
+; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v8, v15
+; GFX10-DL-NEXT:    v_and_b32_sdwa v16, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v17, 28, v2
+; GFX10-DL-NEXT:    v_lshlrev_b16 v1, 8, v1
+; GFX10-DL-NEXT:    v_and_b32_sdwa v15, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v0, 15, v11
+; GFX10-DL-NEXT:    v_and_b32_e32 v2, 15, v7
 ; GFX10-DL-NEXT:    v_mul_lo_u16 v5, v5, v13
-; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v6
-; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v12, v7
-; GFX10-DL-NEXT:    v_mul_lo_u16 v11, v10, v16
-; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-DL-NEXT:    v_or_b32_e32 v7, v8, v1
+; GFX10-DL-NEXT:    v_mul_lo_u16 v8, v10, v17
+; GFX10-DL-NEXT:    v_mul_lo_u16 v6, v6, v14
+; GFX10-DL-NEXT:    v_mul_lo_u16 v2, v0, v2
+; GFX10-DL-NEXT:    v_lshlrev_b16 v5, 8, v5
+; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 16, v7
+; GFX10-DL-NEXT:    v_mul_lo_u16 v1, v12, v15
+; GFX10-DL-NEXT:    v_mul_lo_u16 v10, v9, v16
 ; GFX10-DL-NEXT:    v_lshlrev_b16 v8, 8, v8
-; GFX10-DL-NEXT:    v_or_b32_sdwa v13, v9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_or_b32_e32 v5, v5, v9
+; GFX10-DL-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-DL-NEXT:    v_or_b32_sdwa v11, v5, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_or_b32_e32 v5, v6, v5
+; GFX10-DL-NEXT:    v_or_b32_sdwa v6, v10, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_or_b32_e32 v1, v1, v2
-; GFX10-DL-NEXT:    v_or_b32_sdwa v2, v11, v8 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 8, v13
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 8, v11
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
-; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v9
+; GFX10-DL-NEXT:    v_or_b32_sdwa v1, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_add_nc_u16 v5, v3, v2
 ; GFX10-DL-NEXT:    v_lshrrev_b64 v[2:3], 24, v[0:1]
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v0, v5, v6
+; GFX10-DL-NEXT:    v_add_nc_u16 v0, v5, v7
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v2
-; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v7, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v12, v15, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 8, v8
-; GFX10-DL-NEXT:    v_mad_u16 v0, v10, v16, v0
+; GFX10-DL-NEXT:    v_mad_u16 v0, v9, v16, v0
 ; GFX10-DL-NEXT:    v_add_nc_u16 v0, v0, v1
 ; GFX10-DL-NEXT:    global_store_byte v4, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_endpgm
@@ -2846,8 +3046,8 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX8-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX8-NEXT:    v_lshlrev_b32_e32 v2, 2, v0
+; GFX8-NEXT:    v_mov_b32_e32 v5, 15
 ; GFX8-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
-; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s5
 ; GFX8-NEXT:    v_add_u32_e32 v0, vcc, s4, v2
@@ -2860,37 +3060,44 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX8-NEXT:    v_mov_b32_e32 v0, s0
 ; GFX8-NEXT:    v_mov_b32_e32 v1, s1
 ; GFX8-NEXT:    flat_load_ubyte v4, v[0:1]
+; GFX8-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX8-NEXT:    s_mov_b32 s14, -1
 ; GFX8-NEXT:    s_mov_b32 s15, 0xe80000
 ; GFX8-NEXT:    s_add_u32 s12, s12, s9
 ; GFX8-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX8-NEXT:    s_waitcnt vmcnt(2)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v5, 28, v3
-; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 24, v3
-; GFX8-NEXT:    v_bfe_u32 v7, v3, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v8, v3, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v9, v3, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v10, v3, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v11, v3, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v6, 28, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v7, 24, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v8, 20, v3
+; GFX8-NEXT:    v_and_b32_sdwa v9, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v10, 12, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v11, 8, v3
+; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 4, v3
 ; GFX8-NEXT:    v_and_b32_e32 v3, 15, v3
 ; GFX8-NEXT:    s_waitcnt vmcnt(1)
-; GFX8-NEXT:    v_lshrrev_b32_e32 v12, 28, v2
-; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 24, v2
-; GFX8-NEXT:    v_bfe_u32 v14, v2, 20, 4
-; GFX8-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX8-NEXT:    v_bfe_u32 v16, v2, 12, 4
-; GFX8-NEXT:    v_bfe_u32 v17, v2, 8, 4
-; GFX8-NEXT:    v_bfe_u32 v18, v2, 4, 4
+; GFX8-NEXT:    v_lshrrev_b32_e32 v13, 28, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v14, 24, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v15, 20, v2
+; GFX8-NEXT:    v_and_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX8-NEXT:    v_lshrrev_b32_e32 v16, 12, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v17, 8, v2
+; GFX8-NEXT:    v_lshrrev_b32_e32 v18, 4, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
+; GFX8-NEXT:    v_and_b32_e32 v12, 15, v12
+; GFX8-NEXT:    v_and_b32_e32 v18, 15, v18
 ; GFX8-NEXT:    s_waitcnt vmcnt(0)
 ; GFX8-NEXT:    v_mad_u16 v2, v3, v2, v4
-; GFX8-NEXT:    v_mad_u16 v2, v11, v18, v2
-; GFX8-NEXT:    v_mad_u16 v2, v10, v17, v2
-; GFX8-NEXT:    v_mad_u16 v2, v9, v16, v2
+; GFX8-NEXT:    v_and_b32_e32 v11, 15, v11
+; GFX8-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX8-NEXT:    v_mad_u16 v2, v12, v18, v2
+; GFX8-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX8-NEXT:    v_and_b32_e32 v16, 15, v16
+; GFX8-NEXT:    v_mad_u16 v2, v11, v17, v2
+; GFX8-NEXT:    v_mad_u16 v2, v10, v16, v2
+; GFX8-NEXT:    v_mad_u16 v2, v9, v5, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v8, v15, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v7, v14, v2
 ; GFX8-NEXT:    v_mad_u16 v2, v6, v13, v2
-; GFX8-NEXT:    v_mad_u16 v2, v5, v12, v2
 ; GFX8-NEXT:    v_and_b32_e32 v2, 15, v2
 ; GFX8-NEXT:    flat_store_byte v[0:1], v2
 ; GFX8-NEXT:    s_endpgm
@@ -2900,53 +3107,62 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX9-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-NEXT:    s_mov_b32 s2, 0x5040100
-; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-NEXT:    s_mov_b32 s14, -1
 ; GFX9-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-NEXT:    v_and_b32_e32 v4, 15, v1
-; GFX9-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_and_b32_e32 v11, 15, v2
-; GFX9-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-NEXT:    v_perm_b32 v6, v7, v6, s2
-; GFX9-NEXT:    v_perm_b32 v7, v12, v11, s2
-; GFX9-NEXT:    v_perm_b32 v4, v5, v4, s2
-; GFX9-NEXT:    v_bfe_u32 v8, v1, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX9-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-NEXT:    v_perm_b32 v8, v9, v8, s2
-; GFX9-NEXT:    v_perm_b32 v9, v14, v13, s2
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-NEXT:    v_bfe_u32 v10, v1, 24, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX9-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX9-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX9-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX9-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
+; GFX9-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX9-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-NEXT:    v_bfe_u32 v17, v2, 24, 4
+; GFX9-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX9-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
+; GFX9-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-NEXT:    v_perm_b32 v2, v2, v17, s2
-; GFX9-NEXT:    v_perm_b32 v1, v1, v10, s2
-; GFX9-NEXT:    v_perm_b32 v10, v16, v15, s2
-; GFX9-NEXT:    v_add_u16_e32 v3, v3, v5
+; GFX9-NEXT:    v_perm_b32 v7, v8, v7, s2
+; GFX9-NEXT:    v_perm_b32 v8, v13, v12, s2
+; GFX9-NEXT:    v_perm_b32 v5, v6, v5, s2
+; GFX9-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX9-NEXT:    v_perm_b32 v2, v2, v4, s2
+; GFX9-NEXT:    v_perm_b32 v1, v1, v11, s2
+; GFX9-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
 ; GFX9-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
+; GFX9-NEXT:    v_perm_b32 v2, v17, v16, s2
+; GFX9-NEXT:    v_perm_b32 v4, v10, v9, s2
+; GFX9-NEXT:    v_perm_b32 v9, v15, v14, s2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_add_u16_e32 v3, v5, v3
+; GFX9-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
+; GFX9-NEXT:    v_pk_mul_lo_u16 v4, v7, v9
 ; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_add_u16_e32 v3, v3, v4
+; GFX9-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-NEXT:    v_add_u16_e32 v2, v2, v1
@@ -2960,53 +3176,62 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX9-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX9-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX9-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX9-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX9-DL-NEXT:    s_mov_b32 s2, 0x5040100
-; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_waitcnt lgkmcnt(0)
 ; GFX9-DL-NEXT:    global_load_dword v1, v0, s[4:5]
 ; GFX9-DL-NEXT:    global_load_dword v2, v0, s[6:7]
 ; GFX9-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX9-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
+; GFX9-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX9-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX9-DL-NEXT:    s_mov_b32 s14, -1
 ; GFX9-DL-NEXT:    s_mov_b32 s15, 0xe00000
 ; GFX9-DL-NEXT:    s_add_u32 s12, s12, s9
 ; GFX9-DL-NEXT:    s_addc_u32 s13, s13, 0
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX9-DL-NEXT:    v_and_b32_e32 v4, 15, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v5, v1, 4, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v7, v1, 12, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v7, 8, v1
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v8, 12, v1
 ; GFX9-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-DL-NEXT:    v_and_b32_e32 v11, 15, v2
-; GFX9-DL-NEXT:    v_bfe_u32 v12, v2, 4, 4
-; GFX9-DL-NEXT:    v_perm_b32 v6, v7, v6, s2
-; GFX9-DL-NEXT:    v_perm_b32 v7, v12, v11, s2
-; GFX9-DL-NEXT:    v_perm_b32 v4, v5, v4, s2
-; GFX9-DL-NEXT:    v_bfe_u32 v8, v1, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v9, v1, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v13, v2, 8, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v14, v2, 12, 4
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v7
-; GFX9-DL-NEXT:    v_perm_b32 v8, v9, v8, s2
-; GFX9-DL-NEXT:    v_perm_b32 v9, v14, v13, s2
-; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v4, v3
-; GFX9-DL-NEXT:    v_bfe_u32 v10, v1, 24, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v13, 4, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX9-DL-NEXT:    v_and_b32_e32 v12, 15, v2
+; GFX9-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX9-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX9-DL-NEXT:    v_and_b32_e32 v8, 15, v8
+; GFX9-DL-NEXT:    v_and_b32_e32 v13, 15, v13
+; GFX9-DL-NEXT:    v_and_b32_sdwa v9, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v10, 20, v1
+; GFX9-DL-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX9-DL-NEXT:    v_bfe_u32 v15, v2, 16, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v16, v2, 20, 4
-; GFX9-DL-NEXT:    v_bfe_u32 v17, v2, 24, 4
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v14, 8, v2
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v15, 12, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v16, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v17, 20, v2
+; GFX9-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX9-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v9
-; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
-; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v17, s2
-; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v10, s2
-; GFX9-DL-NEXT:    v_perm_b32 v10, v16, v15, s2
-; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v5
+; GFX9-DL-NEXT:    v_perm_b32 v7, v8, v7, s2
+; GFX9-DL-NEXT:    v_perm_b32 v8, v13, v12, s2
+; GFX9-DL-NEXT:    v_perm_b32 v5, v6, v5, s2
+; GFX9-DL-NEXT:    v_and_b32_e32 v10, 15, v10
+; GFX9-DL-NEXT:    v_and_b32_e32 v14, 15, v14
+; GFX9-DL-NEXT:    v_and_b32_e32 v15, 15, v15
+; GFX9-DL-NEXT:    v_and_b32_e32 v17, 15, v17
+; GFX9-DL-NEXT:    v_perm_b32 v2, v2, v4, s2
+; GFX9-DL-NEXT:    v_perm_b32 v1, v1, v11, s2
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v8
 ; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v8, v10
+; GFX9-DL-NEXT:    v_perm_b32 v2, v17, v16, s2
+; GFX9-DL-NEXT:    v_perm_b32 v4, v10, v9, s2
+; GFX9-DL-NEXT:    v_perm_b32 v9, v15, v14, s2
+; GFX9-DL-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v5, v3
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v2, v4, v2
+; GFX9-DL-NEXT:    v_pk_mul_lo_u16 v4, v7, v9
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v4
+; GFX9-DL-NEXT:    v_add_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v3, v3, v2
 ; GFX9-DL-NEXT:    v_add_u16_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
 ; GFX9-DL-NEXT:    v_add_u16_e32 v2, v2, v1
@@ -3021,6 +3246,7 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    s_load_dwordx4 s[4:7], s[2:3], 0x24
 ; GFX10-DL-NEXT:    s_load_dwordx2 s[0:1], s[2:3], 0x34
 ; GFX10-DL-NEXT:    v_lshlrev_b32_e32 v0, 2, v0
+; GFX10-DL-NEXT:    v_mov_b32_e32 v4, 15
 ; GFX10-DL-NEXT:    s_mov_b32 s12, SCRATCH_RSRC_DWORD0
 ; GFX10-DL-NEXT:    s_mov_b32 s13, SCRATCH_RSRC_DWORD1
 ; GFX10-DL-NEXT:    s_mov_b32 s14, -1
@@ -3034,45 +3260,53 @@ define amdgpu_kernel void @udot8_acc4_vecMul(ptr addrspace(1) %src1,
 ; GFX10-DL-NEXT:    v_mov_b32_e32 v0, 0
 ; GFX10-DL-NEXT:    global_load_ubyte v3, v0, s[0:1]
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(2)
-; GFX10-DL-NEXT:    v_and_b32_e32 v4, 15, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v7, 4, v1
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v2
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 4, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v8, v2, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v1, 12, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v10, v1, 20, 4
-; GFX10-DL-NEXT:    v_perm_b32 v5, v6, v5, 0x5040100
-; GFX10-DL-NEXT:    v_perm_b32 v4, v7, v4, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v1, 8, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v2, 8, 4
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v4, v5
-; GFX10-DL-NEXT:    v_perm_b32 v6, v9, v6, 0x5040100
-; GFX10-DL-NEXT:    v_perm_b32 v7, v8, v7, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v5, v1, 16, 4
-; GFX10-DL-NEXT:    v_bfe_u32 v9, v2, 20, 4
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 4, v2
+; GFX10-DL-NEXT:    v_and_b32_e32 v5, 15, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 8, v1
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v7
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 12, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 8, v2
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v13, 12, v2
+; GFX10-DL-NEXT:    v_perm_b32 v5, v7, v5, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v6, v6, v10, 0x5040100
+; GFX10-DL-NEXT:    v_and_b32_e32 v7, 15, v8
+; GFX10-DL-NEXT:    v_and_b32_e32 v8, 15, v12
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v13
+; GFX10-DL-NEXT:    v_and_b32_e32 v9, 15, v9
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v5, v6
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v6, 20, v1
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v12, 20, v2
+; GFX10-DL-NEXT:    v_perm_b32 v8, v10, v8, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v7, v9, v7, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v5
 ; GFX10-DL-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v4, v3
-; GFX10-DL-NEXT:    v_bfe_u32 v4, v2, 16, 4
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v6, v6, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v5, v3
+; GFX10-DL-NEXT:    v_and_b32_sdwa v11, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_sdwa v5, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX10-DL-NEXT:    v_and_b32_e32 v10, 15, v12
+; GFX10-DL-NEXT:    v_and_b32_e32 v6, 15, v6
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v7, v7, v8
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_and_b32_sdwa v8, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_perm_b32 v5, v10, v5, 0x5040100
-; GFX10-DL-NEXT:    v_bfe_u32 v7, v1, 24, 4
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT:    v_perm_b32 v4, v9, v4, 0x5040100
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v8, 16, v6
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v6
-; GFX10-DL-NEXT:    v_bfe_u32 v6, v2, 24, 4
+; GFX10-DL-NEXT:    v_perm_b32 v6, v6, v11, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v9, 16, v7
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v7
+; GFX10-DL-NEXT:    v_and_b32_sdwa v4, v2, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v2, 28, v2
-; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v4, v5, v4
-; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v7, 0x5040100
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v8
-; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v6, 0x5040100
-; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v5, 16, v4
-; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v4
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v1, 28, v1
+; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v5, v6, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v9
+; GFX10-DL-NEXT:    v_perm_b32 v2, v2, v4, 0x5040100
+; GFX10-DL-NEXT:    v_perm_b32 v1, v1, v8, 0x5040100
+; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v4, 16, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v3, v3, v5
 ; GFX10-DL-NEXT:    v_pk_mul_lo_u16 v1, v1, v2
-; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v5
+; GFX10-DL-NEXT:    v_add_nc_u16 v2, v3, v4
 ; GFX10-DL-NEXT:    v_lshrrev_b32_e32 v3, 16, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v2, v1
 ; GFX10-DL-NEXT:    v_add_nc_u16 v1, v1, v3
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
index cef959f45437db..86dad05db81f9f 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf-kill.ll
@@ -6,24 +6,26 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
 ; SI-LABEL: uniform_kill:
 ; SI:       ; %bb.0: ; %entry
 ; SI-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; SI-NEXT:    s_mov_b64 s[0:1], exec
-; SI-NEXT:    s_mov_b64 s[2:3], -1
-; SI-NEXT:    v_or_b32_e32 v0, v1, v0
+; SI-NEXT:    v_and_b32_e32 v1, 1, v1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; SI-NEXT:    s_mov_b64 s[2:3], exec
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
-; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; SI-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; SI-NEXT:    s_or_b64 s[6:7], vcc, s[0:1]
+; SI-NEXT:    s_mov_b64 s[4:5], -1
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
 ; SI-NEXT:  ; %bb.1: ; %if1
-; SI-NEXT:    s_xor_b64 s[2:3], exec, -1
+; SI-NEXT:    s_xor_b64 s[4:5], exec, -1
 ; SI-NEXT:  ; %bb.2: ; %endif1
-; SI-NEXT:    s_or_b64 exec, exec, s[4:5]
-; SI-NEXT:    s_wqm_b64 s[4:5], s[2:3]
-; SI-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
-; SI-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; SI-NEXT:    s_or_b64 exec, exec, s[0:1]
+; SI-NEXT:    s_wqm_b64 s[0:1], s[4:5]
+; SI-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; SI-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
 ; SI-NEXT:    s_cbranch_scc0 .LBB0_6
 ; SI-NEXT:  ; %bb.3: ; %endif1
-; SI-NEXT:    s_and_b64 exec, exec, s[0:1]
+; SI-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; SI-NEXT:    v_mov_b32_e32 v0, 0
-; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; SI-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
 ; SI-NEXT:    s_cbranch_execz .LBB0_5
 ; SI-NEXT:  ; %bb.4: ; %if2
 ; SI-NEXT:    s_mov_b32 s3, 0
@@ -48,24 +50,26 @@ define amdgpu_ps float @uniform_kill(float %a, i32 %b, float %c) {
 ; FLAT-LABEL: uniform_kill:
 ; FLAT:       ; %bb.0: ; %entry
 ; FLAT-NEXT:    v_cvt_i32_f32_e32 v0, v0
-; FLAT-NEXT:    s_mov_b64 s[0:1], exec
-; FLAT-NEXT:    s_mov_b64 s[2:3], -1
-; FLAT-NEXT:    v_or_b32_e32 v0, v1, v0
+; FLAT-NEXT:    v_and_b32_e32 v1, 1, v1
+; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v1
+; FLAT-NEXT:    s_mov_b64 s[2:3], exec
 ; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
-; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
-; FLAT-NEXT:    s_and_saveexec_b64 s[4:5], vcc
+; FLAT-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v0
+; FLAT-NEXT:    s_or_b64 s[6:7], vcc, s[0:1]
+; FLAT-NEXT:    s_mov_b64 s[4:5], -1
+; FLAT-NEXT:    s_and_saveexec_b64 s[0:1], s[6:7]
 ; FLAT-NEXT:  ; %bb.1: ; %if1
-; FLAT-NEXT:    s_xor_b64 s[2:3], exec, -1
+; FLAT-NEXT:    s_xor_b64 s[4:5], exec, -1
 ; FLAT-NEXT:  ; %bb.2: ; %endif1
-; FLAT-NEXT:    s_or_b64 exec, exec, s[4:5]
-; FLAT-NEXT:    s_wqm_b64 s[4:5], s[2:3]
-; FLAT-NEXT:    s_xor_b64 s[4:5], s[4:5], exec
-; FLAT-NEXT:    s_andn2_b64 s[0:1], s[0:1], s[4:5]
+; FLAT-NEXT:    s_or_b64 exec, exec, s[0:1]
+; FLAT-NEXT:    s_wqm_b64 s[0:1], s[4:5]
+; FLAT-NEXT:    s_xor_b64 s[0:1], s[0:1], exec
+; FLAT-NEXT:    s_andn2_b64 s[2:3], s[2:3], s[0:1]
 ; FLAT-NEXT:    s_cbranch_scc0 .LBB0_6
 ; FLAT-NEXT:  ; %bb.3: ; %endif1
-; FLAT-NEXT:    s_and_b64 exec, exec, s[0:1]
+; FLAT-NEXT:    s_and_b64 exec, exec, s[2:3]
 ; FLAT-NEXT:    v_mov_b32_e32 v0, 0
-; FLAT-NEXT:    s_and_saveexec_b64 s[0:1], s[2:3]
+; FLAT-NEXT:    s_and_saveexec_b64 s[0:1], s[4:5]
 ; FLAT-NEXT:    s_cbranch_execz .LBB0_5
 ; FLAT-NEXT:  ; %bb.4: ; %if2
 ; FLAT-NEXT:    s_mov_b32 s3, 0
diff --git a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
index 9f3596359a6625..c0c5477af3e364 100644
--- a/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
+++ b/llvm/test/CodeGen/AMDGPU/si-annotate-cf.ll
@@ -7,15 +7,17 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out,
 ; SI:       ; %bb.0: ; %main_body
 ; SI-NEXT:    s_load_dword s0, s[2:3], 0xb
 ; SI-NEXT:    v_mbcnt_lo_u32_b32_e64 v0, -1, 0
-; SI-NEXT:    s_waitcnt lgkmcnt(0)
-; SI-NEXT:    v_and_b32_e32 v0, s0, v0
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
 ; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT:    s_waitcnt lgkmcnt(0)
+; SI-NEXT:    s_bitcmp1_b32 s0, 0
+; SI-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; SI-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
 ; SI-NEXT:    s_mov_b64 s[0:1], 0
 ; SI-NEXT:  .LBB0_1: ; %ENDIF
 ; SI-NEXT:    ; =>This Inner Loop Header: Depth=1
-; SI-NEXT:    s_and_b64 s[4:5], exec, vcc
-; SI-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; SI-NEXT:    s_and_b64 s[6:7], exec, s[4:5]
+; SI-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
 ; SI-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; SI-NEXT:    s_cbranch_execnz .LBB0_1
 ; SI-NEXT:  ; %bb.2: ; %ENDLOOP
@@ -32,15 +34,17 @@ define amdgpu_kernel void @break_inserted_outside_of_loop(ptr addrspace(1) %out,
 ; FLAT:       ; %bb.0: ; %main_body
 ; FLAT-NEXT:    s_load_dword s0, s[2:3], 0x2c
 ; FLAT-NEXT:    v_mbcnt_lo_u32_b32 v0, -1, 0
-; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
-; FLAT-NEXT:    v_and_b32_e32 v0, s0, v0
 ; FLAT-NEXT:    v_and_b32_e32 v0, 1, v0
 ; FLAT-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; FLAT-NEXT:    s_waitcnt lgkmcnt(0)
+; FLAT-NEXT:    s_bitcmp1_b32 s0, 0
+; FLAT-NEXT:    s_cselect_b64 s[0:1], -1, 0
+; FLAT-NEXT:    s_and_b64 s[4:5], s[0:1], vcc
 ; FLAT-NEXT:    s_mov_b64 s[0:1], 0
 ; FLAT-NEXT:  .LBB0_1: ; %ENDIF
 ; FLAT-NEXT:    ; =>This Inner Loop Header: Depth=1
-; FLAT-NEXT:    s_and_b64 s[4:5], exec, vcc
-; FLAT-NEXT:    s_or_b64 s[0:1], s[4:5], s[0:1]
+; FLAT-NEXT:    s_and_b64 s[6:7], exec, s[4:5]
+; FLAT-NEXT:    s_or_b64 s[0:1], s[6:7], s[0:1]
 ; FLAT-NEXT:    s_andn2_b64 exec, exec, s[0:1]
 ; FLAT-NEXT:    s_cbranch_execnz .LBB0_1
 ; FLAT-NEXT:  ; %bb.2: ; %ENDLOOP
diff --git a/llvm/test/CodeGen/AMDGPU/xor.ll b/llvm/test/CodeGen/AMDGPU/xor.ll
index 9fac17f33d0d36..47028a1c11eaa4 100644
--- a/llvm/test/CodeGen/AMDGPU/xor.ll
+++ b/llvm/test/CodeGen/AMDGPU/xor.ll
@@ -182,8 +182,12 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
 ; SI-NEXT:    s_waitcnt vmcnt(0)
 ; SI-NEXT:    s_mov_b32 s8, s4
 ; SI-NEXT:    s_mov_b32 s9, s5
-; SI-NEXT:    v_xor_b32_e32 v0, v0, v1
 ; SI-NEXT:    v_and_b32_e32 v0, 1, v0
+; SI-NEXT:    v_and_b32_e32 v1, 1, v1
+; SI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v0
+; SI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v1
+; SI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
+; SI-NEXT:    v_cndmask_b32_e64 v0, 0, 1, s[0:1]
 ; SI-NEXT:    buffer_store_byte v0, off, s[8:11], 0
 ; SI-NEXT:    s_endpgm
 ;
@@ -202,8 +206,12 @@ define amdgpu_kernel void @v_xor_i1(ptr addrspace(1) %out, ptr addrspace(1) %in0
 ; VI-NEXT:    s_waitcnt vmcnt(0)
 ; VI-NEXT:    v_mov_b32_e32 v0, s4
 ; VI-NEXT:    v_mov_b32_e32 v1, s5
-; VI-NEXT:    v_xor_b32_e32 v2, v4, v2
+; VI-NEXT:    v_and_b32_e32 v3, 1, v4
 ; VI-NEXT:    v_and_b32_e32 v2, 1, v2
+; VI-NEXT:    v_cmp_eq_u32_e32 vcc, 1, v3
+; VI-NEXT:    v_cmp_eq_u32_e64 s[0:1], 1, v2
+; VI-NEXT:    s_xor_b64 s[0:1], vcc, s[0:1]
+; VI-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[0:1]
 ; VI-NEXT:    flat_store_byte v[0:1], v2
 ; VI-NEXT:    s_endpgm
   %a = load volatile i1, ptr addrspace(1) %in0

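For reference, the 32-bit to 16-bit narrowing visible throughout the AMDGPU diffs above (v_add_nc_u32 becoming v_add_nc_u16, v_mul_lo_u32 becoming v_mul_lo_u16, and so on) rests on the identity that truncation distributes over wrap-around arithmetic and bitwise logic, so the low bits of the result are unchanged when the operands are narrowed first. A minimal standalone C++ check of that identity, using arbitrary constants and no LLVM dependency:

#include <cassert>
#include <cstdint>

int main() {
  const uint32_t X = 0x9ABCDEF0u, Y = 0x87654321u;
  const uint16_t TX = static_cast<uint16_t>(X); // trunc x
  const uint16_t TY = static_cast<uint16_t>(Y); // trunc y

  // trunc (op x, y) == op (trunc x), (trunc y) for these ops.
  assert(static_cast<uint16_t>(X + Y) == static_cast<uint16_t>(TX + TY));
  assert(static_cast<uint16_t>(X - Y) == static_cast<uint16_t>(TX - TY));
  assert(static_cast<uint16_t>(X * Y) == static_cast<uint16_t>(TX * TY));
  assert(static_cast<uint16_t>(X & Y) == (TX & TY));
  assert(static_cast<uint16_t>(X | Y) == (TX | TY));
  assert(static_cast<uint16_t>(X ^ Y) == (TX ^ TY));
  return 0;
}

The same identity does not hold for division or for shift amounts that depend on the truncated-away bits, which is one reason combines like this are limited to opcodes such as add, sub, mul, and the bitwise ops.
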
>From f4a04556adf1735e6ccf7ddf7359fc1a92fff0f5 Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 24 Sep 2024 15:41:44 -0700
Subject: [PATCH 55/56] Pass the correct SDNode to isNarrowingProfitable in
 DAGCombiner

---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 805ab4b035097b..d5b3fc00f3eedc 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -15532,7 +15532,7 @@ SDValue DAGCombiner::visitTRUNCATE(SDNode *N) {
     break;
   }
 
-  if (TLI.isNarrowingProfitable(N, SrcVT, VT)) {
+  if (TLI.isNarrowingProfitable(N0.getNode(), SrcVT, VT)) {
     switch (N0.getOpcode()) {
     case ISD::ADD:
     case ISD::SUB:

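The one-line fix above matters because the hook is meant to judge the operation that will be rewritten at the narrow type (the truncate's operand N0), not the truncate node consuming it. A hypothetical standalone mock of that distinction; the Node type, opcode set, and hook body below are illustrative only and are not the LLVM TargetLowering API:

#include <cassert>

// Hypothetical opcode set and node type, for illustration only.
enum class Opcode { Truncate, Add };

struct Node {
  Opcode Op;
  const Node *Operand0 = nullptr;
};

// Mock profitability hook that keys off the opcode of the node it is
// given; only the arithmetic node is considered worth narrowing.
static bool isNarrowingProfitableMock(const Node *N) {
  return N->Op == Opcode::Add;
}

int main() {
  const Node Add{Opcode::Add};
  const Node Trunc{Opcode::Truncate, &Add};

  // Querying the truncate itself (the old argument) always answers "no".
  assert(!isNarrowingProfitableMock(&Trunc));
  // Querying the truncate's operand (the new argument) reflects the
  // operation that will actually be rewritten at the narrow type.
  assert(isNarrowingProfitableMock(Trunc.Operand0));
  return 0;
}
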
>From c58575619fd2e19af2b2321393c59b843935c87e Mon Sep 17 00:00:00 2001
From: Justin Fargnoli <jfargnoli at nvidia.com>
Date: Tue, 24 Sep 2024 15:41:59 -0700
Subject: [PATCH 56/56] Update AMDGPU tests

---
 llvm/test/CodeGen/AMDGPU/and.ll        |  12 +-
 llvm/test/CodeGen/AMDGPU/permute_i8.ll | 597 +++++++++++++------------
 2 files changed, 315 insertions(+), 294 deletions(-)

diff --git a/llvm/test/CodeGen/AMDGPU/and.ll b/llvm/test/CodeGen/AMDGPU/and.ll
index c6233642110eab..d332e9e04670bc 100644
--- a/llvm/test/CodeGen/AMDGPU/and.ll
+++ b/llvm/test/CodeGen/AMDGPU/and.ll
@@ -176,11 +176,13 @@ define amdgpu_kernel void @s_and_i64(ptr addrspace(1) %out, i64 %a, i64 %b) {
 
 ; FUNC-LABEL: {{^}}s_and_i1:
 ; SI: s_load_dword [[LOAD:s[0-9]+]]
-; SI: s_lshr_b32 [[B_SHIFT:s[0-9]+]], [[LOAD]], 8
-; SI: s_and_b32 [[AND:s[0-9]+]], [[LOAD]], [[B_SHIFT]]
-; SI: s_and_b32 [[AND_TRUNC:s[0-9]+]], [[AND]], 1{{$}}
-; SI: v_mov_b32_e32 [[V_AND_TRUNC:v[0-9]+]], [[AND_TRUNC]]
-; SI: buffer_store_byte [[V_AND_TRUNC]]
+; SI: s_bitcmp1_b32 [[LOAD]], 0
+; SI: s_cselect_b64 [[SEL1:s\[[0-9]+:[0-9]+\]]], -1, 0
+; SI: s_bitcmp1_b32 [[LOAD]], 8
+; SI: s_cselect_b64 [[SEL2:s\[[0-9]+:[0-9]+\]]], -1, 0
+; SI: s_and_b64 [[SEL1]], [[SEL1]], [[SEL2]]
+; SI: v_cndmask_b32_e64 [[V:v[0-9]+]], 0, 1, [[SEL1]]
+; SI: buffer_store_byte [[V]]
 define amdgpu_kernel void @s_and_i1(ptr addrspace(1) %out, i1 %a, i1 %b) {
   %and = and i1 %a, %b
   store i1 %and, ptr addrspace(1) %out
diff --git a/llvm/test/CodeGen/AMDGPU/permute_i8.ll b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
index 57f5473749513f..e7e3cfca58b9cc 100644
--- a/llvm/test/CodeGen/AMDGPU/permute_i8.ll
+++ b/llvm/test/CodeGen/AMDGPU/permute_i8.ll
@@ -1508,59 +1508,61 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    global_load_dword v4, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v1, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
 ; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v15, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v16, v10
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v12
-; GFX10-NEXT:    v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v14
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v15, v1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; GFX10-NEXT:    v_xor_b32_sdwa v3, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v12
 ; GFX10-NEXT:    v_xor_b32_sdwa v13, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:BYTE_3
-; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
-; GFX10-NEXT:    v_mul_f32_e32 v15, v2, v15
-; GFX10-NEXT:    v_mul_f32_e32 v16, v19, v16
+; GFX10-NEXT:    v_xor_b32_sdwa v0, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_2
 ; GFX10-NEXT:    v_ashrrev_i32_e32 v3, 30, v3
+; GFX10-NEXT:    v_mul_f32_e32 v16, v19, v16
+; GFX10-NEXT:    v_mul_f32_e32 v18, v1, v18
+; GFX10-NEXT:    v_mul_f32_e32 v15, v2, v15
+; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT:    v_or_b32_e32 v3, 1, v3
+; GFX10-NEXT:    v_trunc_f32_e32 v16, v16
+; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
 ; GFX10-NEXT:    v_mul_f32_e32 v17, v2, v17
-; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_trunc_f32_e32 v15, v15
-; GFX10-NEXT:    v_trunc_f32_e32 v16, v16
-; GFX10-NEXT:    v_mul_f32_e32 v18, v1, v18
+; GFX10-NEXT:    v_ashrrev_i32_e32 v0, 30, v0
+; GFX10-NEXT:    v_mad_f32 v19, -v16, v10, v19
+; GFX10-NEXT:    v_mad_f32 v21, -v18, v14, v1
+; GFX10-NEXT:    v_or_b32_e32 v13, 1, v13
 ; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
 ; GFX10-NEXT:    v_mad_f32 v20, -v15, v1, v2
-; GFX10-NEXT:    v_mad_f32 v19, -v16, v10, v19
-; GFX10-NEXT:    v_or_b32_e32 v3, 1, v3
-; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
+; GFX10-NEXT:    v_or_b32_e32 v0, 1, v0
 ; GFX10-NEXT:    v_mad_f32 v2, -v17, v12, v2
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v13, 30, v13
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v3, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
 ; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
-; GFX10-NEXT:    v_mad_f32 v21, -v18, v14, v1
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v15, v15
-; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, |v10|
-; GFX10-NEXT:    v_or_b32_e32 v13, 1, v13
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v16, v16
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v3, vcc_lo
+; GFX10-NEXT:    v_cndmask_b32_e32 v10, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, |v1|
+; GFX10-NEXT:    v_cndmask_b32_e32 v0, 0, v0, vcc_lo
 ; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v2|, |v12|
-; GFX10-NEXT:    v_add_nc_u32_e32 v0, v15, v0
-; GFX10-NEXT:    v_add_nc_u32_sdwa v1, v16, v1 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v14|
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v17, v2
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v13, vcc_lo
-; GFX10-NEXT:    v_add_nc_u32_sdwa v3, v18, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_add_nc_u16 v2, v16, v3
+; GFX10-NEXT:    v_add_nc_u16 v3, v18, v10
+; GFX10-NEXT:    v_add_nc_u16 v0, v15, v0
+; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v11, vcc_lo
+; GFX10-NEXT:    v_lshlrev_b16 v2, 8, v2
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX10-NEXT:    v_add_nc_u16 v1, v17, v1
+; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x60706
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
@@ -1628,10 +1630,10 @@ define hidden void @sdiv_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v9, vcc
 ; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, |v4|
 ; GFX9-NEXT:    v_cndmask_b32_e32 v4, 0, v14, vcc
-; GFX9-NEXT:    v_add_u32_e32 v1, v15, v1
-; GFX9-NEXT:    v_add_u32_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_add_u32_e32 v3, v17, v3
-; GFX9-NEXT:    v_add_u32_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_e32 v1, v15, v1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v16, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_e32 v3, v17, v3
+; GFX9-NEXT:    v_add_u16_sdwa v4, v18, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -1862,79 +1864,83 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
 ; GFX10-NEXT:    v_and_b32_e32 v4, 0x3ff, v31
 ; GFX10-NEXT:    v_lshlrev_b32_e32 v4, 2, v4
-; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
 ; GFX10-NEXT:    v_add_co_u32 v0, vcc_lo, v0, v4
 ; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo
-; GFX10-NEXT:    global_load_dword v4, v[2:3], off
-; GFX10-NEXT:    global_load_dword v9, v[0:1], off
+; GFX10-NEXT:    v_add_co_u32 v2, vcc_lo, v2, v4
+; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo
+; GFX10-NEXT:    global_load_dword v4, v[0:1], off
+; GFX10-NEXT:    global_load_dword v9, v[2:3], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT:    s_waitcnt vmcnt(0)
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v17, v2
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v18, v13
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    s_waitcnt vmcnt(0)
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v16, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_cvt_f32_i32_sdwa v23, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v14
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v22, v16
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v19, v3
-; GFX10-NEXT:    v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v20, v15
-; GFX10-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX10-NEXT:    v_cvt_f32_i32_sdwa v21, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX10-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
-; GFX10-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
-; GFX10-NEXT:    v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
-; GFX10-NEXT:    v_mul_f32_e32 v17, v3, v17
-; GFX10-NEXT:    v_mul_f32_e32 v18, v12, v18
-; GFX10-NEXT:    v_mul_f32_e32 v19, v15, v19
-; GFX10-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
-; GFX10-NEXT:    v_or_b32_e32 v1, 1, v1
-; GFX10-NEXT:    v_trunc_f32_e32 v17, v17
-; GFX10-NEXT:    v_trunc_f32_e32 v18, v18
-; GFX10-NEXT:    v_mul_f32_e32 v20, v21, v20
-; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
-; GFX10-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
-; GFX10-NEXT:    v_mad_f32 v22, -v17, v2, v3
-; GFX10-NEXT:    v_mad_f32 v12, -v18, v13, v12
-; GFX10-NEXT:    v_or_b32_e32 v11, 1, v11
+; GFX10-NEXT:    v_xor_b32_sdwa v12, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v21, v10
+; GFX10-NEXT:    v_xor_b32_sdwa v18, sext(v4), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX10-NEXT:    v_xor_b32_sdwa v2, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX10-NEXT:    v_xor_b32_sdwa v15, sext(v9), sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX10-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 8, v9
+; GFX10-NEXT:    v_mul_f32_e32 v20, v13, v20
+; GFX10-NEXT:    v_mul_f32_e32 v22, v23, v22
+; GFX10-NEXT:    v_mul_f32_e32 v19, v10, v19
+; GFX10-NEXT:    v_ashrrev_i32_e32 v18, 30, v18
+; GFX10-NEXT:    v_or_b32_e32 v12, 1, v12
 ; GFX10-NEXT:    v_trunc_f32_e32 v20, v20
-; GFX10-NEXT:    v_mad_f32 v23, -v19, v3, v15
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v22|, |v2|
-; GFX10-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
-; GFX10-NEXT:    v_or_b32_e32 v14, 1, v14
-; GFX10-NEXT:    v_mad_f32 v21, -v20, v15, v21
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v17, v17
-; GFX10-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v12|, |v13|
-; GFX10-NEXT:    v_or_b32_e32 v16, 1, v16
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v18, v18
-; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
+; GFX10-NEXT:    v_trunc_f32_e32 v22, v22
+; GFX10-NEXT:    v_mul_f32_e32 v21, v16, v21
+; GFX10-NEXT:    v_trunc_f32_e32 v19, v19
+; GFX10-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
+; GFX10-NEXT:    v_mad_f32 v13, -v20, v14, v13
+; GFX10-NEXT:    v_mad_f32 v23, -v22, v16, v23
+; GFX10-NEXT:    v_or_b32_e32 v18, 1, v18
+; GFX10-NEXT:    v_trunc_f32_e32 v21, v21
+; GFX10-NEXT:    v_mad_f32 v24, -v19, v3, v10
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v13|, |v14|
+; GFX10-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
+; GFX10-NEXT:    v_or_b32_e32 v2, 1, v2
+; GFX10-NEXT:    v_mad_f32 v25, -v21, v10, v16
 ; GFX10-NEXT:    v_cvt_i32_f32_e32 v20, v20
-; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v3|
+; GFX10-NEXT:    v_cndmask_b32_e32 v12, 0, v12, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v23|, |v16|
+; GFX10-NEXT:    v_or_b32_e32 v15, 1, v15
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v22, v22
+; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 24, v9
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v19, v19
+; GFX10-NEXT:    v_cndmask_b32_e32 v13, 0, v18, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v24|, |v3|
+; GFX10-NEXT:    v_cvt_i32_f32_e32 v21, v21
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
-; GFX10-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v1, v17, v1
-; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v21|, |v15|
-; GFX10-NEXT:    v_add_nc_u32_e32 v2, v18, v2
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v4
-; GFX10-NEXT:    v_add_nc_u32_e32 v3, v19, v3
-; GFX10-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc_lo
-; GFX10-NEXT:    v_mul_lo_u32 v2, v2, v10
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v0
-; GFX10-NEXT:    v_add_nc_u32_e32 v11, v20, v11
-; GFX10-NEXT:    v_sub_nc_u32_e32 v0, v0, v1
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v1, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_lo_u32 v10, v11, v12
-; GFX10-NEXT:    v_sub_nc_u32_e32 v2, v12, v3
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v1, v2, v3 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_or_b32_sdwa v0, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v1, v4, v9, 0x2070306
+; GFX10-NEXT:    v_lshrrev_b32_e32 v1, 16, v9
+; GFX10-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc_lo
+; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v25|, |v10|
+; GFX10-NEXT:    v_add_nc_u16 v10, v20, v12
+; GFX10-NEXT:    v_add_nc_u16 v12, v22, v13
+; GFX10-NEXT:    v_lshrrev_b32_e32 v13, 24, v4
+; GFX10-NEXT:    v_add_nc_u16 v2, v19, v2
+; GFX10-NEXT:    v_cndmask_b32_e32 v3, 0, v15, vcc_lo
+; GFX10-NEXT:    v_mul_lo_u16 v10, v10, v11
+; GFX10-NEXT:    v_mul_lo_u16 v11, v12, v17
+; GFX10-NEXT:    v_mul_lo_u16 v2, v2, v9
+; GFX10-NEXT:    v_add_nc_u16 v3, v21, v3
+; GFX10-NEXT:    v_sub_nc_u16 v10, v13, v10
+; GFX10-NEXT:    v_sub_nc_u16 v0, v0, v11
+; GFX10-NEXT:    v_mul_lo_u16 v3, v3, v1
+; GFX10-NEXT:    v_sub_nc_u16 v1, v1, v2
+; GFX10-NEXT:    v_lshlrev_b16 v0, 8, v0
+; GFX10-NEXT:    v_sub_nc_u16 v2, v17, v3
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v10
+; GFX10-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_or_b32_sdwa v0, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX10-NEXT:    v_perm_b32 v1, v9, v4, 0x2070306
 ; GFX10-NEXT:    global_store_dword v[5:6], v0, off
 ; GFX10-NEXT:    global_store_dword v[7:8], v1, off
 ; GFX10-NEXT:    s_setpc_b64 s[30:31]
@@ -1952,74 +1958,72 @@ define hidden void @srem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2070306
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v14, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v10, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v16, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v2, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v3, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v15, sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v2
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v12, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v19, v13
+; GFX9-NEXT:    v_cvt_f32_i32_sdwa v17, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v20, v3
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v13, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v21, v14
-; GFX9-NEXT:    v_cvt_f32_i32_sdwa v19, sext(v9) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v22, v10
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v23, v16
-; GFX9-NEXT:    v_mul_f32_e32 v20, v10, v20
-; GFX9-NEXT:    v_xor_b32_sdwa v2, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
-; GFX9-NEXT:    v_mul_f32_e32 v21, v13, v21
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v21, v15
+; GFX9-NEXT:    v_mul_f32_e32 v18, v3, v18
+; GFX9-NEXT:    v_xor_b32_sdwa v1, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_0
+; GFX9-NEXT:    v_mul_f32_e32 v19, v12, v19
+; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
+; GFX9-NEXT:    v_xor_b32_sdwa v11, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
+; GFX9-NEXT:    v_ashrrev_i32_e32 v1, 30, v1
+; GFX9-NEXT:    v_mul_f32_e32 v20, v15, v20
+; GFX9-NEXT:    v_mul_f32_e32 v21, v17, v21
+; GFX9-NEXT:    v_trunc_f32_e32 v19, v19
+; GFX9-NEXT:    v_mad_f32 v22, -v18, v2, v3
+; GFX9-NEXT:    v_xor_b32_sdwa v14, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
+; GFX9-NEXT:    v_ashrrev_i32_e32 v11, 30, v11
+; GFX9-NEXT:    v_or_b32_e32 v1, 1, v1
 ; GFX9-NEXT:    v_trunc_f32_e32 v20, v20
-; GFX9-NEXT:    v_xor_b32_sdwa v12, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_1
-; GFX9-NEXT:    v_ashrrev_i32_e32 v2, 30, v2
-; GFX9-NEXT:    v_mul_f32_e32 v22, v16, v22
-; GFX9-NEXT:    v_mul_f32_e32 v23, v19, v23
 ; GFX9-NEXT:    v_trunc_f32_e32 v21, v21
-; GFX9-NEXT:    v_mad_f32 v24, -v20, v3, v10
-; GFX9-NEXT:    v_xor_b32_sdwa v15, sext(v4), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:BYTE_2
-; GFX9-NEXT:    v_ashrrev_i32_e32 v12, 30, v12
-; GFX9-NEXT:    v_or_b32_e32 v2, 1, v2
-; GFX9-NEXT:    v_trunc_f32_e32 v22, v22
-; GFX9-NEXT:    v_trunc_f32_e32 v23, v23
-; GFX9-NEXT:    v_mad_f32 v13, -v21, v14, v13
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v24|, |v3|
-; GFX9-NEXT:    v_xor_b32_sdwa v18, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
-; GFX9-NEXT:    v_ashrrev_i32_e32 v15, 30, v15
-; GFX9-NEXT:    v_or_b32_e32 v12, 1, v12
+; GFX9-NEXT:    v_mad_f32 v12, -v19, v13, v12
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v22|, |v2|
+; GFX9-NEXT:    v_xor_b32_sdwa v16, sext(v9), sext(v4) dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 src1_sel:BYTE_3
+; GFX9-NEXT:    v_ashrrev_i32_e32 v14, 30, v14
+; GFX9-NEXT:    v_or_b32_e32 v11, 1, v11
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v18, v18
+; GFX9-NEXT:    v_cvt_i32_f32_e32 v19, v19
+; GFX9-NEXT:    v_mad_f32 v23, -v20, v3, v15
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v20, v20
+; GFX9-NEXT:    v_mad_f32 v17, -v21, v15, v17
 ; GFX9-NEXT:    v_cvt_i32_f32_e32 v21, v21
-; GFX9-NEXT:    v_mad_f32 v25, -v22, v10, v16
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v22, v22
-; GFX9-NEXT:    v_mad_f32 v19, -v23, v16, v19
-; GFX9-NEXT:    v_cvt_i32_f32_e32 v23, v23
-; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v2, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, |v14|
-; GFX9-NEXT:    v_ashrrev_i32_e32 v18, 30, v18
-; GFX9-NEXT:    v_or_b32_e32 v15, 1, v15
-; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v12, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v25|, |v10|
-; GFX9-NEXT:    v_or_b32_e32 v18, 1, v18
-; GFX9-NEXT:    v_cndmask_b32_e32 v10, 0, v15, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, |v16|
-; GFX9-NEXT:    v_cndmask_b32_e32 v12, 0, v18, vcc
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v11, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v17, 24, v4
-; GFX9-NEXT:    v_add_u32_e32 v2, v20, v2
-; GFX9-NEXT:    v_add_u32_e32 v3, v21, v3
-; GFX9-NEXT:    v_add_u32_e32 v10, v22, v10
-; GFX9-NEXT:    v_add_u32_e32 v12, v23, v12
-; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v11
-; GFX9-NEXT:    v_mul_lo_u32 v4, v10, v0
-; GFX9-NEXT:    v_mul_lo_u32 v10, v12, v17
-; GFX9-NEXT:    v_sub_u32_e32 v0, v0, v2
-; GFX9-NEXT:    v_sub_u32_sdwa v2, v9, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
-; GFX9-NEXT:    v_sub_u32_e32 v3, v17, v4
-; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_cndmask_b32_e32 v1, 0, v1, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v12|, |v13|
+; GFX9-NEXT:    v_ashrrev_i32_e32 v16, 30, v16
+; GFX9-NEXT:    v_or_b32_e32 v14, 1, v14
+; GFX9-NEXT:    v_cndmask_b32_e32 v2, 0, v11, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v23|, |v3|
+; GFX9-NEXT:    v_or_b32_e32 v16, 1, v16
+; GFX9-NEXT:    v_cndmask_b32_e32 v3, 0, v14, vcc
+; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v17|, |v15|
+; GFX9-NEXT:    v_cndmask_b32_e32 v11, 0, v16, vcc
+; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
+; GFX9-NEXT:    v_add_u16_e32 v1, v18, v1
+; GFX9-NEXT:    v_add_u16_e32 v2, v19, v2
+; GFX9-NEXT:    v_add_u16_e32 v3, v20, v3
+; GFX9-NEXT:    v_add_u16_e32 v11, v21, v11
+; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v1, v4
+; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v2, v10
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v3, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
+; GFX9-NEXT:    v_sub_u16_sdwa v1, v4, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u16_sdwa v2, v9, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u16_sdwa v3, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_3 src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u16_sdwa v4, v9, v10 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()
@@ -2234,45 +2238,49 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
 ; GFX10-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
+; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v14, v0
-; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v15, v0
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v15, v0
 ; GFX10-NEXT:    v_perm_b32 v0, v0, v2, 0x40207
-; GFX10-NEXT:    v_mul_f32_e32 v10, v14, v10
 ; GFX10-NEXT:    v_mul_f32_e32 v11, v4, v11
 ; GFX10-NEXT:    v_mul_f32_e32 v13, v1, v13
+; GFX10-NEXT:    v_mul_f32_e32 v10, v14, v10
 ; GFX10-NEXT:    v_mul_f32_e32 v12, v15, v12
-; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
 ; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
 ; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
+; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
 ; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
-; GFX10-NEXT:    v_mad_f32 v14, -v10, v1, v14
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GFX10-NEXT:    v_mad_f32 v16, -v11, v3, v4
 ; GFX10-NEXT:    v_mad_f32 v17, -v13, v9, v1
+; GFX10-NEXT:    v_mad_f32 v14, -v10, v1, v14
+; GFX10-NEXT:    v_mad_f32 v15, -v12, v4, v15
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v14|, v1
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v16|, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GFX10-NEXT:    v_mad_f32 v15, -v12, v4, v15
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v16|, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v17|, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v17|, v9
+; GFX10-NEXT:    v_add_nc_u16 v3, v11, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v14|, v1
 ; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v4
-; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_add_nc_u16 v9, v13, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v15|, v4
 ; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
+; GFX10-NEXT:    v_add_nc_u16 v1, v10, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX10-NEXT:    v_add_nc_u16 v4, v12, v4
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
@@ -2293,43 +2301,45 @@ define hidden void @udiv(ptr addrspace(1) %in0, ptr addrspace(1) %in1, i8 %elt,
 ; GFX9-NEXT:    s_mov_b32 s4, 0x40207
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v11, v2
 ; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v11, v2
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v1, v9
-; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v10
-; GFX9-NEXT:    v_mul_f32_e32 v11, v1, v11
 ; GFX9-NEXT:    v_perm_b32 v0, v9, v4, s4
+; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4
 ; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v4, v4
-; GFX9-NEXT:    v_trunc_f32_e32 v11, v11
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v12, v3
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v10
 ; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v4
-; GFX9-NEXT:    v_mul_f32_e32 v12, v10, v12
-; GFX9-NEXT:    v_mad_f32 v1, -v11, v2, v1
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v1, v9
+; GFX9-NEXT:    v_mul_f32_e32 v11, v1, v11
 ; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v9, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v12, v12
+; GFX9-NEXT:    v_mul_f32_e32 v12, v10, v12
+; GFX9-NEXT:    v_trunc_f32_e32 v11, v11
 ; GFX9-NEXT:    v_mul_f32_e32 v13, v9, v13
+; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14
+; GFX9-NEXT:    v_trunc_f32_e32 v12, v12
+; GFX9-NEXT:    v_mad_f32 v1, -v11, v2, v1
+; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
+; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
 ; GFX9-NEXT:    v_mad_f32 v15, -v12, v3, v10
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v1|, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v11, v11
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v12, v12
-; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v1|, v2
-; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14
 ; GFX9-NEXT:    v_mad_f32 v9, -v13, v10, v9
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GFX9-NEXT:    v_addc_co_u32_e32 v1, vcc, 0, v11, vcc
-; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v15|, v3
 ; GFX9-NEXT:    v_mad_f32 v16, -v14, v4, v2
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v12, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v9|, v10
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v13, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v16|, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v4, vcc, 0, v14, vcc
-; GFX9-NEXT:    v_lshlrev_b16_e32 v2, 8, v2
-; GFX9-NEXT:    v_lshlrev_b16_e32 v4, 8, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v15|, v3
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v9|, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v16|, v4
+; GFX9-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s[4:5]
+; GFX9-NEXT:    v_add_u16_e32 v1, v11, v1
+; GFX9-NEXT:    v_add_u16_sdwa v2, v12, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_add_u16_e32 v3, v13, v3
+; GFX9-NEXT:    v_add_u16_sdwa v4, v14, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
@@ -2422,53 +2432,60 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX10-NEXT:    global_load_dword v2, v[2:3], off
 ; GFX10-NEXT:    global_load_dword v0, v[0:1], off
 ; GFX10-NEXT:    s_waitcnt vmcnt(1)
-; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
 ; GFX10-NEXT:    v_cvt_f32_ubyte1_e32 v3, v2
-; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
 ; GFX10-NEXT:    v_cvt_f32_ubyte3_e32 v9, v2
+; GFX10-NEXT:    v_cvt_f32_ubyte0_e32 v1, v2
+; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v4, v2
 ; GFX10-NEXT:    s_waitcnt vmcnt(0)
 ; GFX10-NEXT:    v_cvt_f32_ubyte2_e32 v15, v0
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v11, v3
-; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
 ; GFX10-NEXT:    v_rcp_iflag_f32_e32 v13, v9
-; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v2
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v10, v1
+; GFX10-NEXT:    v_rcp_iflag_f32_e32 v12, v4
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v16, 8, v2
 ; GFX10-NEXT:    v_lshrrev_b32_e32 v17, 24, v2
-; GFX10-NEXT:    v_mul_f32_e32 v10, v3, v10
+; GFX10-NEXT:    v_lshrrev_b32_e32 v14, 16, v0
+; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x2050505
 ; GFX10-NEXT:    v_mul_f32_e32 v11, v3, v11
-; GFX10-NEXT:    v_mul_f32_e32 v12, v3, v12
 ; GFX10-NEXT:    v_mul_f32_e32 v13, v15, v13
-; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
+; GFX10-NEXT:    v_mul_f32_e32 v10, v3, v10
+; GFX10-NEXT:    v_mul_f32_e32 v12, v3, v12
 ; GFX10-NEXT:    v_trunc_f32_e32 v11, v11
-; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
 ; GFX10-NEXT:    v_trunc_f32_e32 v13, v13
-; GFX10-NEXT:    v_mad_f32 v18, -v10, v1, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GFX10-NEXT:    v_trunc_f32_e32 v10, v10
+; GFX10-NEXT:    v_trunc_f32_e32 v12, v12
 ; GFX10-NEXT:    v_mad_f32 v19, -v11, v3, v3
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
-; GFX10-NEXT:    v_mad_f32 v20, -v12, v4, v3
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v18|, v1
-; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
 ; GFX10-NEXT:    v_mad_f32 v15, -v13, v9, v15
+; GFX10-NEXT:    v_mad_f32 v18, -v10, v1, v3
+; GFX10-NEXT:    v_mad_f32 v20, -v12, v4, v3
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v11, v11
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v19|, v3
 ; GFX10-NEXT:    v_cvt_u32_f32_e32 v13, v13
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v1, vcc_lo, 0, v10, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v19|, v3
-; GFX10-NEXT:    v_mul_lo_u32 v1, v1, v2
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v3, vcc_lo, 0, v11, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v20|, v4
-; GFX10-NEXT:    v_mul_lo_u32 v3, v3, v16
-; GFX10-NEXT:    v_sub_nc_u32_e32 v1, v16, v1
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v4, vcc_lo, 0, v12, vcc_lo
-; GFX10-NEXT:    v_cmp_ge_f32_e64 vcc_lo, |v15|, v9
-; GFX10-NEXT:    v_mul_lo_u32 v4, v4, v14
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v3, v16, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX10-NEXT:    v_add_co_ci_u32_e32 v9, vcc_lo, 0, v13, vcc_lo
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v10, v10
+; GFX10-NEXT:    v_cvt_u32_f32_e32 v12, v12
+; GFX10-NEXT:    v_cndmask_b32_e64 v3, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v15|, v9
+; GFX10-NEXT:    v_add_nc_u16 v3, v11, v3
+; GFX10-NEXT:    v_cndmask_b32_e64 v9, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v18|, v1
+; GFX10-NEXT:    v_lshrrev_b32_e32 v11, 16, v2
+; GFX10-NEXT:    v_mul_lo_u16 v3, v3, v16
+; GFX10-NEXT:    v_add_nc_u16 v9, v13, v9
+; GFX10-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s4
+; GFX10-NEXT:    v_cmp_ge_f32_e64 s4, |v20|, v4
+; GFX10-NEXT:    v_sub_nc_u16 v3, v16, v3
+; GFX10-NEXT:    v_mul_lo_u16 v9, v9, v17
+; GFX10-NEXT:    v_add_nc_u16 v1, v10, v1
+; GFX10-NEXT:    v_cndmask_b32_e64 v4, 0, 1, s4
+; GFX10-NEXT:    v_lshlrev_b16 v3, 8, v3
+; GFX10-NEXT:    v_sub_nc_u16 v9, v14, v9
+; GFX10-NEXT:    v_mul_lo_u16 v1, v1, v2
+; GFX10-NEXT:    v_add_nc_u16 v4, v12, v4
+; GFX10-NEXT:    v_lshlrev_b16 v9, 8, v9
+; GFX10-NEXT:    v_sub_nc_u16 v1, v16, v1
+; GFX10-NEXT:    v_mul_lo_u16 v4, v4, v11
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX10-NEXT:    v_mul_lo_u32 v9, v9, v17
-; GFX10-NEXT:    v_sub_nc_u32_e32 v4, v16, v4
-; GFX10-NEXT:    v_sub_nc_u32_sdwa v9, v0, v9 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX10-NEXT:    v_perm_b32 v0, v2, v0, 0x2050505
+; GFX10-NEXT:    v_sub_nc_u16 v4, v16, v4
 ; GFX10-NEXT:    v_or_b32_sdwa v3, v4, v9 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
 ; GFX10-NEXT:    v_or_b32_sdwa v1, v1, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
 ; GFX10-NEXT:    global_store_dword v[5:6], v1, off
@@ -2488,57 +2505,59 @@ define hidden void @urem_store_div(ptr addrspace(1) %in0, ptr addrspace(1) %in1,
 ; GFX9-NEXT:    global_load_dword v9, v[0:1], off
 ; GFX9-NEXT:    s_mov_b32 s4, 0x2050505
 ; GFX9-NEXT:    s_waitcnt vmcnt(1)
-; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v2, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v2
-; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v3, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v3
-; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v11, v4
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v17, v11
-; GFX9-NEXT:    v_mul_f32_e32 v15, v3, v15
-; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v14, v4
+; GFX9-NEXT:    v_cvt_f32_ubyte0_e32 v1, v4
+; GFX9-NEXT:    v_cvt_f32_ubyte1_e32 v2, v4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v13, v1
+; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v10, v4
+; GFX9-NEXT:    v_cvt_f32_ubyte3_e32 v12, v4
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v14, v2
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v15, v10
+; GFX9-NEXT:    v_rcp_iflag_f32_e32 v16, v12
+; GFX9-NEXT:    v_mul_f32_e32 v13, v2, v13
+; GFX9-NEXT:    s_waitcnt vmcnt(0)
+; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v11, v9
+; GFX9-NEXT:    v_mul_f32_e32 v14, v2, v14
+; GFX9-NEXT:    v_trunc_f32_e32 v13, v13
+; GFX9-NEXT:    v_mul_f32_e32 v15, v2, v15
+; GFX9-NEXT:    v_mul_f32_e32 v16, v11, v16
+; GFX9-NEXT:    v_trunc_f32_e32 v14, v14
+; GFX9-NEXT:    v_mad_f32 v17, -v13, v1, v2
+; GFX9-NEXT:    v_perm_b32 v0, v4, v9, s4
 ; GFX9-NEXT:    v_trunc_f32_e32 v15, v15
-; GFX9-NEXT:    v_rcp_iflag_f32_e32 v18, v14
-; GFX9-NEXT:    v_mul_f32_e32 v16, v3, v16
-; GFX9-NEXT:    v_mad_f32 v19, -v15, v2, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
 ; GFX9-NEXT:    v_trunc_f32_e32 v16, v16
-; GFX9-NEXT:    v_mul_f32_e32 v17, v3, v17
-; GFX9-NEXT:    v_mad_f32 v20, -v16, v3, v3
+; GFX9-NEXT:    v_mad_f32 v18, -v14, v2, v2
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v17|, v1
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v13, v13
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v14, v14
+; GFX9-NEXT:    v_mad_f32 v19, -v15, v10, v2
+; GFX9-NEXT:    v_cvt_u32_f32_e32 v15, v15
+; GFX9-NEXT:    v_mad_f32 v11, -v16, v12, v11
 ; GFX9-NEXT:    v_cvt_u32_f32_e32 v16, v16
-; GFX9-NEXT:    s_waitcnt vmcnt(0)
-; GFX9-NEXT:    v_cvt_f32_ubyte2_e32 v13, v9
-; GFX9-NEXT:    v_trunc_f32_e32 v17, v17
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v19|, v2
-; GFX9-NEXT:    v_mul_f32_e32 v18, v13, v18
-; GFX9-NEXT:    v_mad_f32 v21, -v17, v11, v3
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v17, v17
-; GFX9-NEXT:    v_addc_co_u32_e32 v2, vcc, 0, v15, vcc
-; GFX9-NEXT:    v_trunc_f32_e32 v18, v18
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v20|, v3
-; GFX9-NEXT:    v_mad_f32 v13, -v18, v14, v13
-; GFX9-NEXT:    v_cvt_u32_f32_e32 v18, v18
-; GFX9-NEXT:    v_addc_co_u32_e32 v3, vcc, 0, v16, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v21|, v11
-; GFX9-NEXT:    v_addc_co_u32_e32 v11, vcc, 0, v17, vcc
-; GFX9-NEXT:    v_cmp_ge_f32_e64 vcc, |v13|, v14
-; GFX9-NEXT:    v_lshrrev_b32_e32 v0, 16, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v10, 8, v4
-; GFX9-NEXT:    v_lshrrev_b32_e32 v12, 24, v4
-; GFX9-NEXT:    v_addc_co_u32_e32 v13, vcc, 0, v18, vcc
-; GFX9-NEXT:    v_perm_b32 v1, v4, v9, s4
-; GFX9-NEXT:    v_mul_lo_u32 v2, v2, v4
-; GFX9-NEXT:    v_mul_lo_u32 v3, v3, v10
-; GFX9-NEXT:    v_mul_lo_u32 v0, v11, v0
-; GFX9-NEXT:    v_mul_lo_u32 v4, v13, v12
-; GFX9-NEXT:    v_sub_u32_e32 v2, v10, v2
-; GFX9-NEXT:    v_sub_u32_sdwa v3, v10, v3 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
-; GFX9-NEXT:    v_sub_u32_e32 v0, v10, v0
-; GFX9-NEXT:    v_sub_u32_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v0, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
-; GFX9-NEXT:    v_or_b32_sdwa v0, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
-; GFX9-NEXT:    global_store_dword v[5:6], v0, off
-; GFX9-NEXT:    global_store_dword v[7:8], v1, off
+; GFX9-NEXT:    v_cndmask_b32_e64 v1, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v18|, v2
+; GFX9-NEXT:    v_cndmask_b32_e64 v2, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v19|, v10
+; GFX9-NEXT:    v_cndmask_b32_e64 v10, 0, 1, s[4:5]
+; GFX9-NEXT:    v_cmp_ge_f32_e64 s[4:5], |v11|, v12
+; GFX9-NEXT:    v_cndmask_b32_e64 v11, 0, 1, s[4:5]
+; GFX9-NEXT:    v_lshrrev_b32_e32 v3, 8, v4
+; GFX9-NEXT:    v_add_u16_e32 v1, v13, v1
+; GFX9-NEXT:    v_add_u16_e32 v2, v14, v2
+; GFX9-NEXT:    v_add_u16_e32 v10, v15, v10
+; GFX9-NEXT:    v_add_u16_e32 v11, v16, v11
+; GFX9-NEXT:    v_mul_lo_u16_e32 v1, v1, v4
+; GFX9-NEXT:    v_mul_lo_u16_e32 v2, v2, v3
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v10, v10, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1
+; GFX9-NEXT:    v_mul_lo_u16_sdwa v4, v11, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_3
+; GFX9-NEXT:    v_sub_u16_e32 v1, v3, v1
+; GFX9-NEXT:    v_sub_u16_sdwa v2, v3, v2 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD
+; GFX9-NEXT:    v_sub_u16_e32 v3, v3, v10
+; GFX9-NEXT:    v_sub_u16_sdwa v4, v9, v4 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:WORD_1 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v2, v3, v4 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD
+; GFX9-NEXT:    v_or_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_0 src1_sel:DWORD
+; GFX9-NEXT:    global_store_dword v[5:6], v1, off
+; GFX9-NEXT:    global_store_dword v[7:8], v0, off
 ; GFX9-NEXT:    s_waitcnt vmcnt(0)
 ; GFX9-NEXT:    s_setpc_b64 s[30:31]
   %tid = call i32 @llvm.amdgcn.workitem.id.x()